From 4410ecd239caf53de680d04557620781cc9e8953 Mon Sep 17 00:00:00 2001
From: Liangliang He <lliang.he@gmail.com>
Date: Wed, 7 Mar 2018 21:47:25 +0800
Subject: [PATCH] Reformat code and enable cpplint

---
 .gitlab-ci.yml                                |   9 +
 mace/core/allocator.h                         |   4 +-
 mace/core/arg_helper.cc                       |   4 +-
 mace/core/buffer.h                            | 143 +++---
 mace/core/mace.cc                             | 425 ++++++------------
 mace/core/net.cc                              |  14 +-
 mace/core/operator.h                          |   4 +-
 mace/core/preallocated_pooled_allocator.h     |   4 +-
 .../hexagon/hexagon_control_wrapper.cc        | 152 +++----
 .../runtime/hexagon/hexagon_control_wrapper.h |  18 +-
 .../hexagon/hexagon_controller_dummy.cc       | 166 +++++--
 mace/core/runtime/hexagon/hexagon_nn.h        | 202 ++++++---
 mace/core/runtime/hexagon/hexagon_nn_ops.h    |  10 +-
 mace/core/runtime/hexagon/ops.h               |   1 -
 mace/core/runtime/hexagon/quantize.cc         |  10 +-
 mace/core/runtime/hexagon/quantize.h          |  18 +-
 mace/core/runtime/opencl/opencl_allocator.cc  |  27 +-
 .../core/runtime/opencl/opencl_development.cc |   5 +-
 mace/core/runtime/opencl/opencl_production.cc |   3 +-
 mace/core/runtime/opencl/opencl_runtime.cc    |  37 +-
 mace/core/runtime/opencl/opencl_runtime.h     |   4 +-
 mace/core/runtime/opencl/opencl_wrapper.h     |   8 +-
 mace/core/tensor.h                            |  95 ++--
 mace/core/testing/test_benchmark.cc           |   4 +-
 mace/core/testing/test_benchmark.h            |   2 +-
 mace/core/types.cc                            |  20 +-
 mace/core/workspace.cc                        |  75 ++--
 mace/core/workspace.h                         |   4 +-
 mace/kernels/activation.h                     |  22 +-
 mace/kernels/addn.h                           |   2 +-
 mace/kernels/batch_norm.h                     |  10 +-
 mace/kernels/bias_add.h                       |   4 +-
 mace/kernels/buffer_to_image.h                |  14 +-
 mace/kernels/channel_shuffle.h                |   6 +-
 mace/kernels/concat.h                         |  20 +-
 mace/kernels/conv_2d.h                        |  12 +-
 mace/kernels/conv_pool_2d_util.cc             |  31 +-
 mace/kernels/conv_pool_2d_util.h              |   2 +-
 mace/kernels/depthwise_conv2d.h               |   9 +-
 mace/kernels/eltwise.h                        |  25 +-
 mace/kernels/fully_connected.h                |  18 +-
 mace/kernels/global_avg_pooling.h             |   6 +-
 mace/kernels/matmul.h                         |   5 +-
 mace/kernels/neon/batch_norm_neon.cc          |   3 +-
 mace/kernels/neon/conv_2d_neon.cc             |   9 +-
 mace/kernels/neon/conv_2d_neon_3x3.cc         |  12 +-
 mace/kernels/neon/depthwise_conv_neon.cc      |   9 +-
 mace/kernels/opencl/activation_opencl.cc      |   7 +-
 mace/kernels/opencl/addn.cc                   |  24 +-
 mace/kernels/opencl/batch_norm_opencl.cc      |   9 +-
 mace/kernels/opencl/bias_add_opencl.cc        |  21 +-
 mace/kernels/opencl/buffer_to_image.cc        |  54 +--
 mace/kernels/opencl/cl/common.h               |   4 +-
 mace/kernels/opencl/concat.cc                 |  53 +--
 mace/kernels/opencl/conv_2d_opencl.cc         |  37 +-
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     |  16 +-
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     |  15 +-
 mace/kernels/opencl/conv_2d_opencl_general.cc |  15 +-
 mace/kernels/opencl/depthwise_conv_opencl.cc  |  23 +-
 mace/kernels/opencl/eltwise_opencl.cc         |  20 +-
 mace/kernels/opencl/fully_connected_opencl.cc |  35 +-
 mace/kernels/opencl/helper.cc                 |  88 ++--
 mace/kernels/opencl/helper.h                  |  13 +-
 mace/kernels/opencl/matmul.cc                 |  27 +-
 mace/kernels/opencl/pooling_opencl.cc         |  27 +-
 mace/kernels/opencl/resize_bilinear_opencl.cc |  20 +-
 mace/kernels/opencl/softmax_opencl.cc         |  17 +-
 mace/kernels/opencl/space_to_batch_opencl.cc  |  31 +-
 mace/kernels/opencl/winograd_transform.cc     | 101 +++--
 mace/kernels/pooling.h                        |  51 ++-
 mace/kernels/reshape.h                        |   3 +-
 mace/kernels/resize_bilinear.h                |  21 +-
 mace/kernels/space_to_batch.h                 |  22 +-
 mace/kernels/winograd_transform.h             |  44 +-
 mace/ops/activation.h                         |   3 +-
 mace/ops/activation_test.cc                   |   7 +-
 mace/ops/addn.h                               |  10 +-
 mace/ops/addn_benchmark.cc                    |   3 +-
 mace/ops/batch_norm_benchmark.cc              |   2 +-
 mace/ops/batch_to_space.h                     |  27 +-
 mace/ops/batch_to_space_benchmark.cc          |   2 +-
 mace/ops/bias_add_benchmark.cc                |   2 +-
 mace/ops/buffer_to_image.h                    |   9 +-
 mace/ops/buffer_to_image_test.cc              |  71 +--
 mace/ops/channel_shuffle.h                    |   4 +-
 mace/ops/channel_shuffle_benchmark.cc         |   2 +-
 mace/ops/concat.h                             |   5 +-
 mace/ops/concat_benchmark.cc                  |  19 +-
 mace/ops/concat_test.cc                       |   8 +-
 mace/ops/conv_2d_test.cc                      | 104 ++---
 mace/ops/eltwise.h                            |  10 +-
 mace/ops/eltwise_benchmark.cc                 |   2 +-
 mace/ops/eltwise_test.cc                      |  96 ++--
 mace/ops/folded_batch_norm.cc                 |  27 +-
 mace/ops/folded_batch_norm_test.cc            |   2 +-
 mace/ops/fully_connected.h                    |   9 +-
 mace/ops/fully_connected_benchmark.cc         |  28 +-
 mace/ops/fully_connected_test.cc              |  98 ++--
 mace/ops/fused_conv_2d_test.cc                |  40 +-
 mace/ops/global_avg_pooling.h                 |   2 +-
 mace/ops/global_avg_pooling_benchmark.cc      |   4 +-
 mace/ops/image_to_buffer.h                    |   9 +-
 mace/ops/matmul.h                             |   4 +-
 mace/ops/matmul_test.cc                       |  62 ++-
 mace/ops/ops_test_util.h                      |   9 +-
 mace/ops/pooling.h                            |   8 +-
 mace/ops/pooling_benchmark.cc                 |   4 +-
 mace/ops/pooling_test.cc                      |   6 +-
 mace/ops/reshape.h                            |   8 +-
 mace/ops/reshape_test.cc                      |   1 -
 mace/ops/softmax.cc                           |   6 +-
 mace/ops/softmax.h                            |   5 +-
 mace/ops/softmax_test.cc                      |  16 +-
 mace/ops/space_to_batch.h                     |  25 +-
 mace/ops/space_to_batch_benchmark.cc          |   2 +-
 mace/ops/winograd_convolution_test.cc         |  36 +-
 mace/ops/winograd_inverse_transform.h         |   4 +-
 mace/ops/winograd_transform.h                 |   6 +-
 mace/ops/winograd_transform_benchmark.cc      |  42 +-
 mace/public/mace.h                            |  46 +-
 mace/utils/command_line_flags.h               |   2 +-
 mace/utils/env_time.h                         |   1 -
 mace/utils/logging.h                          |   2 +-
 mace/utils/string_util.h                      |   2 +-
 mace/utils/timer.h                            |  16 +-
 mace/utils/tuner_test.cc                      |  29 +-
 mace/utils/utils.h                            |   4 +-
 mace/utils/utils_test.cc                      |  29 +-
 128 files changed, 1634 insertions(+), 1831 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1144cbc3..d25fa503 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,6 +1,15 @@
 stages:
   - ops_test
   - ops_benchmark
+  - cpplint
+
+cpplint:
+  stage: cpplint
+  only:
+    - master
+  script:
+    - curl -fsSL -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
+    - python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name '*.h' -or -name '*.cc' | grep -vE "half.h")
 
 ops_test:
   stage: ops_test
diff --git a/mace/core/allocator.h b/mace/core/allocator.h
index 7ab65f0b..eebbb32b 100644
--- a/mace/core/allocator.h
+++ b/mace/core/allocator.h
@@ -9,8 +9,8 @@
 #include <malloc.h>
 
 #include "mace/core/registry.h"
-#include "mace/public/mace.h"
 #include "mace/core/types.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 
@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
     free(data);
   };
   void *Map(void *buffer, size_t offset, size_t nbytes) const override {
-    return (char*)buffer + offset;
+    return static_cast<char *>(buffer) + offset;
   }
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
diff --git a/mace/core/arg_helper.cc b/mace/core/arg_helper.cc
index 41892b7a..8b6d57fb 100644
--- a/mace/core/arg_helper.cc
+++ b/mace/core/arg_helper.cc
@@ -83,12 +83,12 @@ INSTANTIATE_GET_SINGLE_ARGUMENT(string, s, false)
 #define INSTANTIATE_GET_REPEATED_ARGUMENT(T, fieldname,                   \
                                           enforce_lossless_conversion)    \
   template <>                                                             \
-  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                       \
+  std::vector<T> ArgumentHelper::GetRepeatedArgument<T>(                  \
       const string &name, const std::vector<T> &default_value) const {    \
     if (arg_map_.count(name) == 0) {                                      \
       return default_value;                                               \
     }                                                                     \
-    std::vector<T> values;                                                     \
+    std::vector<T> values;                                                \
     for (const auto &v : arg_map_.at(name).fieldname()) {                 \
       if (enforce_lossless_conversion) {                                  \
         auto supportsConversion =                                         \
diff --git a/mace/core/buffer.h b/mace/core/buffer.h
index c17c4a1d..38c577a5 100644
--- a/mace/core/buffer.h
+++ b/mace/core/buffer.h
@@ -5,9 +5,9 @@
 #ifndef MACE_CORE_BUFFER_H_
 #define MACE_CORE_BUFFER_H_
 
-#include "mace/core/types.h"
-#include "mace/core/allocator.h"
 #include <vector>
+#include "mace/core/allocator.h"
+#include "mace/core/types.h"
 
 namespace mace {
 
@@ -39,23 +39,19 @@ class BufferBase {
 
   virtual bool OnHost() const = 0;
 
-  virtual index_t offset() const {
-    return 0;
-  };
+  virtual index_t offset() const { return 0; }
 
-  template<typename T>
+  template <typename T>
   const T *data() const {
     return reinterpret_cast<const T *>(raw_data());
   }
 
-  template<typename T>
+  template <typename T>
   T *mutable_data() {
     return reinterpret_cast<T *>(raw_mutable_data());
   }
 
-  index_t size() const {
-    return size_;
-  }
+  index_t size() const { return size_; }
 
  protected:
   index_t size_;
@@ -64,26 +60,26 @@ class BufferBase {
 class Buffer : public BufferBase {
  public:
   Buffer(Allocator *allocator)
-    : BufferBase(0),
-      allocator_(allocator),
-      buf_(nullptr),
-      mapped_buf_(nullptr),
-      is_data_owner_(true) {}
+      : BufferBase(0),
+        allocator_(allocator),
+        buf_(nullptr),
+        mapped_buf_(nullptr),
+        is_data_owner_(true) {}
 
   Buffer(Allocator *allocator, index_t size)
-    : BufferBase(size),
-      allocator_(allocator),
-      mapped_buf_(nullptr),
-      is_data_owner_(true) {
+      : BufferBase(size),
+        allocator_(allocator),
+        mapped_buf_(nullptr),
+        is_data_owner_(true) {
     buf_ = allocator->New(size);
   }
 
   Buffer(Allocator *allocator, void *data, index_t size)
-    : BufferBase(size),
-      allocator_(allocator),
-      buf_(data),
-      mapped_buf_(nullptr),
-      is_data_owner_(false) {}
+      : BufferBase(size),
+        allocator_(allocator),
+        buf_(data),
+        mapped_buf_(nullptr),
+        is_data_owner_(false) {}
 
   virtual ~Buffer() {
     if (mapped_buf_ != nullptr) {
@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
   void Copy(void *src, index_t offset, index_t length) {
     MACE_CHECK_NOTNULL(mapped_buf_);
     MACE_CHECK(length <= size_, "out of buffer");
-    memcpy(mapped_buf_, (char *) src + offset, length);
+    memcpy(mapped_buf_, static_cast<char *>(src) + offset, length);
   }
 
-  bool OnHost() const {
-    return allocator_->OnHost();
-  }
+  bool OnHost() const { return allocator_->OnHost(); }
 
  private:
   Allocator *allocator_;
@@ -168,23 +162,24 @@ class Buffer : public BufferBase {
   void *mapped_buf_;
   bool is_data_owner_;
 
- DISABLE_COPY_AND_ASSIGN(Buffer);
+  DISABLE_COPY_AND_ASSIGN(Buffer);
 };
 
 class Image : public BufferBase {
  public:
   Image()
-    : BufferBase(0),
-      allocator_(GetDeviceAllocator(OPENCL)),
-      buf_(nullptr),
-      mapped_buf_(nullptr) {}
+      : BufferBase(0),
+        allocator_(GetDeviceAllocator(OPENCL)),
+        buf_(nullptr),
+        mapped_buf_(nullptr) {}
 
   Image(std::vector<size_t> shape, DataType data_type)
-    : BufferBase(std::accumulate(shape.begin(), shape.end(),
-                                 1, std::multiplies<index_t>())
-                   * GetEnumTypeSize(data_type)),
-      allocator_(GetDeviceAllocator(OPENCL)),
-      mapped_buf_(nullptr) {
+      : BufferBase(
+            std::accumulate(
+                shape.begin(), shape.end(), 1, std::multiplies<index_t>()) *
+            GetEnumTypeSize(data_type)),
+        allocator_(GetDeviceAllocator(OPENCL)),
+        mapped_buf_(nullptr) {
     shape_ = shape;
     data_type_ = data_type;
     buf_ = allocator_->NewImage(shape, data_type);
@@ -214,9 +209,7 @@ class Image : public BufferBase {
     return mapped_buf_;
   }
 
-  std::vector<size_t> image_shape() const {
-    return shape_;
-  }
+  std::vector<size_t> image_shape() const { return shape_; }
 
   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
     MACE_NOT_IMPLEMENTED;
@@ -241,17 +234,11 @@ class Image : public BufferBase {
     mapped_buf_ = nullptr;
   };
 
-  void Resize(index_t size) {
-    MACE_NOT_IMPLEMENTED;
-  }
+  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
 
-  void Copy(void *src, index_t offset, index_t length) {
-    MACE_NOT_IMPLEMENTED;
-  }
+  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
 
-  bool OnHost() const {
-    return allocator_->OnHost();
-  }
+  bool OnHost() const { return allocator_->OnHost(); }
 
  private:
   Allocator *allocator_;
@@ -260,34 +247,25 @@ class Image : public BufferBase {
   void *buf_;
   void *mapped_buf_;
 
- DISABLE_COPY_AND_ASSIGN(Image);
+  DISABLE_COPY_AND_ASSIGN(Image);
 };
 
 class BufferSlice : public BufferBase {
  public:
   BufferSlice()
-    : buffer_(nullptr),
-      mapped_buf_(nullptr),
-      offset_(0),
-      length_(0) {}
+      : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
   BufferSlice(BufferBase *buffer, index_t offset, index_t length)
-    : BufferBase(buffer->size()),
-      buffer_(buffer),
-      mapped_buf_(nullptr),
-      offset_(offset),
-      length_(length) {
+      : BufferBase(buffer->size()),
+        buffer_(buffer),
+        mapped_buf_(nullptr),
+        offset_(offset),
+        length_(length) {
     MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
-    MACE_CHECK(offset + length <= size_,
-               "buffer slice offset + length (",
-               offset,
-               " + ",
-               length,
-               ") should <= ",
-               size_);
+    MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
+               offset, " + ", length, ") should <= ", size_);
   }
-  BufferSlice(const BufferSlice &other) : BufferSlice(other.buffer_,
-                                                      other.offset_,
-                                                      other.length_) {}
+  BufferSlice(const BufferSlice &other)
+      : BufferSlice(other.buffer_, other.offset_, other.length_) {}
 
   ~BufferSlice() {
     if (buffer_ != nullptr && mapped_buf_ != nullptr) {
@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
   const void *raw_data() const {
     if (OnHost()) {
       MACE_CHECK_NOTNULL(buffer_);
-      return (char *) buffer_->raw_data() + offset_;
+      return (char *)buffer_->raw_data() + offset_;
     } else {
       MACE_CHECK_NOTNULL(mapped_buf_);
       return mapped_buf_;
@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
     return nullptr;
   }
 
-  void UnMap(void *mapped_ptr) const {
-    MACE_NOT_IMPLEMENTED;
-  }
+  void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }
 
   void Map(std::vector<size_t> *pitch) {
     MACE_CHECK_NOTNULL(buffer_);
@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
     mapped_buf_ = nullptr;
   };
 
-  void Resize(index_t size) {
-    MACE_NOT_IMPLEMENTED;
-  }
+  void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }
 
-  void Copy(void *src, index_t offset, index_t length) {
-    MACE_NOT_IMPLEMENTED;
-  }
+  void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }
 
-  index_t offset() const {
-    return offset_;
-  }
+  index_t offset() const { return offset_; }
 
-  bool OnHost() const {
-    return buffer_->OnHost();
-  }
+  bool OnHost() const { return buffer_->OnHost(); }
 
  private:
   BufferBase *buffer_;
@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
   index_t offset_;
   index_t length_;
 };
-
 }
 
-#endif // MACE_CORE_BUFFER_H_
+#endif  // MACE_CORE_BUFFER_H_
diff --git a/mace/core/mace.cc b/mace/core/mace.cc
index 0da8449f..52483181 100644
--- a/mace/core/mace.cc
+++ b/mace/core/mace.cc
@@ -3,9 +3,9 @@
 //
 
 #include "mace/public/mace.h"
-#include "mace/core/types.h"
 #include "mace/core/net.h"
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
+#include "mace/core/types.h"
 
 namespace mace {
 
@@ -13,46 +13,34 @@ ConstTensor::ConstTensor(const std::string &name,
                          const unsigned char *data,
                          const std::vector<int64_t> &dims,
                          const DataType data_type,
-                         uint32_t node_id) :
-    name_(name),
-    data_(data),
-    data_size_(std::accumulate(dims.begin(), dims.end(), 1,
-                               std::multiplies<int64_t>())),
-    dims_(dims.begin(), dims.end()),
-    data_type_(data_type),
-    node_id_(node_id) {}
+                         uint32_t node_id)
+    : name_(name),
+      data_(data),
+      data_size_(std::accumulate(
+          dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>())),
+      dims_(dims.begin(), dims.end()),
+      data_type_(data_type),
+      node_id_(node_id) {}
 
 ConstTensor::ConstTensor(const std::string &name,
                          const unsigned char *data,
                          const std::vector<int64_t> &dims,
                          const int data_type,
-                         uint32_t node_id) :
-    name_(name),
-    data_(data),
-    data_size_(std::accumulate(dims.begin(), dims.end(), 1,
-                               std::multiplies<int64_t>())),
-    dims_(dims.begin(), dims.end()),
-    data_type_(static_cast<DataType>(data_type)),
-    node_id_(node_id) {}
-
-const std::string &ConstTensor::name() const {
-  return name_;
-}
-const unsigned char *ConstTensor::data() const {
-  return data_;
-}
-int64_t ConstTensor::data_size() const {
-  return data_size_;
-}
-const std::vector<int64_t> &ConstTensor::dims() const {
-  return dims_;
-}
-DataType ConstTensor::data_type() const {
-  return data_type_;
-}
-uint32_t ConstTensor::node_id() const {
-  return node_id_;
-}
+                         uint32_t node_id)
+    : name_(name),
+      data_(data),
+      data_size_(std::accumulate(
+          dims.begin(), dims.end(), int64_t{1}, std::multiplies<int64_t>())),
+      dims_(dims.begin(), dims.end()),
+      data_type_(static_cast<DataType>(data_type)),
+      node_id_(node_id) {}
+
+const std::string &ConstTensor::name() const { return name_; }
+const unsigned char *ConstTensor::data() const { return data_; }
+int64_t ConstTensor::data_size() const { return data_size_; }
+const std::vector<int64_t> &ConstTensor::dims() const { return dims_; }
+DataType ConstTensor::data_type() const { return data_type_; }
+uint32_t ConstTensor::node_id() const { return node_id_; }
 
 Argument::Argument() : has_bits_(0) {}
 
@@ -73,74 +61,42 @@ void Argument::CopyFrom(const Argument &from) {
 
   this->has_bits_ = from.has_bits_;
 }
-const std::string &Argument::name() const {
-  return name_;
-}
-void Argument::set_name(const std::string &value) {
-  name_ = value;
-}
-bool Argument::has_f() const {
-  return (has_bits_ & 0x00000001u) != 0;
-}
-void Argument::set_has_f() {
-  has_bits_ |= 0x00000001u;
-}
-float Argument::f() const {
-  return f_;
-}
+const std::string &Argument::name() const { return name_; }
+void Argument::set_name(const std::string &value) { name_ = value; }
+bool Argument::has_f() const { return (has_bits_ & 0x00000001u) != 0; }
+void Argument::set_has_f() { has_bits_ |= 0x00000001u; }
+float Argument::f() const { return f_; }
 void Argument::set_f(float value) {
   set_has_f();
   f_ = value;
 }
-bool Argument::has_i() const {
-  return (has_bits_ & 0x00000002u) != 0;
-}
-void Argument::set_has_i() {
-  has_bits_ |= 0x00000002u;
-}
-int64_t Argument::i() const {
-  return i_;
-}
+bool Argument::has_i() const { return (has_bits_ & 0x00000002u) != 0; }
+void Argument::set_has_i() { has_bits_ |= 0x00000002u; }
+int64_t Argument::i() const { return i_; }
 void Argument::set_i(int64_t value) {
   set_has_i();
   i_ = value;
 }
-bool Argument::has_s() const {
-  return (has_bits_ & 0x00000004u) != 0;
-}
-void Argument::set_has_s() {
-  has_bits_ |= 0x00000004u;
-}
-std::string Argument::s() const {
-  return s_;
-}
+bool Argument::has_s() const { return (has_bits_ & 0x00000004u) != 0; }
+void Argument::set_has_s() { has_bits_ |= 0x00000004u; }
+std::string Argument::s() const { return s_; }
 void Argument::set_s(const std::string &value) {
   set_has_s();
   s_ = value;
 }
-const std::vector<float> &Argument::floats() const {
-  return floats_;
-}
-void Argument::add_floats(float value) {
-  floats_.push_back(value);
-}
+const std::vector<float> &Argument::floats() const { return floats_; }
+void Argument::add_floats(float value) { floats_.push_back(value); }
 void Argument::set_floats(const std::vector<float> &value) {
   floats_.resize(value.size());
   std::copy(value.begin(), value.end(), floats_.begin());
 }
-const std::vector<int64_t> &Argument::ints() const {
-  return ints_;
-}
-void Argument::add_ints(int64_t value) {
-  ints_.push_back(value);
-}
+const std::vector<int64_t> &Argument::ints() const { return ints_; }
+void Argument::add_ints(int64_t value) { ints_.push_back(value); }
 void Argument::set_ints(const std::vector<int64_t> &value) {
   ints_.resize(value.size());
   std::copy(value.begin(), value.end(), ints_.begin());
 }
-const std::vector<std::string> &Argument::strings() const {
-  return strings_;
-}
+const std::vector<std::string> &Argument::strings() const { return strings_; }
 void Argument::add_strings(const ::std::string &value) {
   strings_.push_back(value);
 }
@@ -156,31 +112,21 @@ void NodeInput::CopyFrom(const NodeInput &from) {
   node_id_ = from.node_id();
   output_port_ = from.output_port();
 }
-int NodeInput::node_id() const {
-  return node_id_;
-}
-void NodeInput::set_node_id(int node_id) {
-  node_id_ = node_id;
-}
-int NodeInput::output_port() const {
-  return output_port_;
-}
-void NodeInput::set_output_port(int output_port) {
-  output_port_ = output_port;
-}
+int NodeInput::node_id() const { return node_id_; }
+void NodeInput::set_node_id(int node_id) { node_id_ = node_id; }
+int NodeInput::output_port() const { return output_port_; }
+void NodeInput::set_output_port(int output_port) { output_port_ = output_port; }
 
 // OutputShape
 OutputShape::OutputShape() {}
-OutputShape::OutputShape(const std::vector<int64_t> &dims) :
-    dims_(dims.begin(), dims.end()) {}
+OutputShape::OutputShape(const std::vector<int64_t> &dims)
+    : dims_(dims.begin(), dims.end()) {}
 void OutputShape::CopyFrom(const OutputShape &from) {
   auto from_dims = from.dims();
   dims_.resize(from_dims.size());
   std::copy(from_dims.begin(), from_dims.end(), dims_.begin());
 }
-const std::vector<int64_t> &OutputShape::dims() const {
-  return dims_;
-}
+const std::vector<int64_t> &OutputShape::dims() const { return dims_; }
 
 // Operator Def
 void OperatorDef::CopyFrom(const OperatorDef &from) {
@@ -220,68 +166,38 @@ void OperatorDef::CopyFrom(const OperatorDef &from) {
   }
   auto from_out_max_byte_size = from.out_max_byte_size();
   out_max_byte_size_.resize(from_out_max_byte_size.size());
-  std::copy(from_out_max_byte_size.begin(),
-            from_out_max_byte_size.end(),
+  std::copy(from_out_max_byte_size.begin(), from_out_max_byte_size.end(),
             out_max_byte_size_.begin());
 
   has_bits_ = from.has_bits_;
-
 }
 
-const std::string &OperatorDef::name() const {
-  return name_;
-}
+const std::string &OperatorDef::name() const { return name_; }
 void OperatorDef::set_name(const std::string &name_) {
   set_has_name();
   OperatorDef::name_ = name_;
 }
-bool OperatorDef::has_name() const {
-  return (has_bits_ & 0x00000001u) != 0;
-}
-void OperatorDef::set_has_name() {
-  has_bits_ |= 0x00000001u;
-}
-const std::string &OperatorDef::type() const {
-  return type_;
-}
+bool OperatorDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
+void OperatorDef::set_has_name() { has_bits_ |= 0x00000001u; }
+const std::string &OperatorDef::type() const { return type_; }
 void OperatorDef::set_type(const std::string &type_) {
   set_has_type();
   OperatorDef::type_ = type_;
 }
-bool OperatorDef::has_type() const {
-  return (has_bits_ & 0x00000002u) != 0;
-}
-void OperatorDef::set_has_type() {
-  has_bits_ |= 0x00000002u;
-}
-int OperatorDef::mem_id() const {
-  return mem_id_;
-}
+bool OperatorDef::has_type() const { return (has_bits_ & 0x00000002u) != 0; }
+void OperatorDef::set_has_type() { has_bits_ |= 0x00000002u; }
+int OperatorDef::mem_id() const { return mem_id_; }
 void OperatorDef::set_mem_id(const int mem_id) {
   set_has_mem_id();
   mem_id_ = mem_id;
 }
-bool OperatorDef::has_mem_id() const {
-  return (has_bits_ & 0x00000004u) != 0;
-}
-void OperatorDef::set_has_mem_id() {
-  has_bits_ |= 0x00000004u;
-}
-uint32_t OperatorDef::node_id() const {
-  return node_id_;
-}
-void OperatorDef::set_node_id(uint32_t node_id) {
-  node_id_ = node_id;
-}
-uint32_t OperatorDef::op_id() const {
-  return op_id_;
-}
-uint32_t OperatorDef::padding() const {
-  return padding_;
-}
-void OperatorDef::set_padding(uint32_t padding) {
-  padding_ = padding;
-}
+bool OperatorDef::has_mem_id() const { return (has_bits_ & 0x00000004u) != 0; }
+void OperatorDef::set_has_mem_id() { has_bits_ |= 0x00000004u; }
+uint32_t OperatorDef::node_id() const { return node_id_; }
+void OperatorDef::set_node_id(uint32_t node_id) { node_id_ = node_id; }
+uint32_t OperatorDef::op_id() const { return op_id_; }
+uint32_t OperatorDef::padding() const { return padding_; }
+void OperatorDef::set_padding(uint32_t padding) { padding_ = padding; }
 const std::vector<NodeInput> &OperatorDef::node_input() const {
   return node_input_;
 }
@@ -294,9 +210,7 @@ const std::vector<int> &OperatorDef::out_max_byte_size() const {
 void OperatorDef::add_out_max_byte_size(int value) {
   out_max_byte_size_.push_back(value);
 }
-const std::vector<std::string> &OperatorDef::input() const {
-  return input_;
-}
+const std::vector<std::string> &OperatorDef::input() const { return input_; }
 const std::string &OperatorDef::input(int index) const {
   MACE_CHECK(0 <= index && index <= input_.size());
   return input_[index];
@@ -308,16 +222,12 @@ std::string *OperatorDef::add_input() {
 void OperatorDef::add_input(const ::std::string &value) {
   input_.push_back(value);
 }
-void OperatorDef::add_input(::std::string &&value) {
-  input_.push_back(value);
-}
+void OperatorDef::add_input(::std::string &&value) { input_.push_back(value); }
 void OperatorDef::set_input(const std::vector<std::string> &value) {
   input_.resize(value.size());
   std::copy(value.begin(), value.end(), input_.begin());
 }
-const std::vector<std::string> &OperatorDef::output() const {
-  return output_;
-}
+const std::vector<std::string> &OperatorDef::output() const { return output_; }
 const std::string &OperatorDef::output(int index) const {
   MACE_CHECK(0 <= index && index <= output_.size());
   return output_[index];
@@ -336,9 +246,7 @@ void OperatorDef::set_output(const std::vector<std::string> &value) {
   output_.resize(value.size());
   std::copy(value.begin(), value.end(), output_.begin());
 }
-const std::vector<Argument> &OperatorDef::arg() const {
-  return arg_;
-}
+const std::vector<Argument> &OperatorDef::arg() const { return arg_; }
 Argument *OperatorDef::add_arg() {
   arg_.emplace_back(Argument());
   return &arg_.back();
@@ -358,18 +266,12 @@ void OperatorDef::set_output_type(const std::vector<DataType> &value) {
 }
 
 // MemoryBlock
-MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y) :
-    mem_id_(mem_id), x_(x), y_(y) {}
+MemoryBlock::MemoryBlock(int mem_id, uint32_t x, uint32_t y)
+    : mem_id_(mem_id), x_(x), y_(y) {}
 
-int MemoryBlock::mem_id() const {
-  return mem_id_;
-}
-uint32_t MemoryBlock::x() const {
-  return x_;
-}
-uint32_t MemoryBlock::y() const {
-  return y_;
-}
+int MemoryBlock::mem_id() const { return mem_id_; }
+uint32_t MemoryBlock::x() const { return x_; }
+uint32_t MemoryBlock::y() const { return y_; }
 
 // MemoryArena
 const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
@@ -378,131 +280,69 @@ const std::vector<MemoryBlock> &MemoryArena::mem_block() const {
 std::vector<MemoryBlock> &MemoryArena::mutable_mem_block() {
   return mem_block_;
 }
-int MemoryArena::mem_block_size() const {
-  return mem_block_.size();
-}
+int MemoryArena::mem_block_size() const { return mem_block_.size(); }
 
 // InputInfo
-const std::string &InputInfo::name() const {
-  return name_;
-}
-int32_t InputInfo::node_id() const {
-  return node_id_;
-}
-int32_t InputInfo::max_byte_size() const {
-  return max_byte_size_;
-}
-DataType InputInfo::data_type() const {
-  return data_type_;
-}
-const std::vector<int32_t> &InputInfo::dims() const {
-  return dims_;
-}
+const std::string &InputInfo::name() const { return name_; }
+int32_t InputInfo::node_id() const { return node_id_; }
+int32_t InputInfo::max_byte_size() const { return max_byte_size_; }
+DataType InputInfo::data_type() const { return data_type_; }
+const std::vector<int32_t> &InputInfo::dims() const { return dims_; }
 
 // OutputInfo
-const std::string &OutputInfo::name() const {
-  return name_;
-}
-int32_t OutputInfo::node_id() const {
-  return node_id_;
-}
-int32_t OutputInfo::max_byte_size() const {
-  return max_byte_size_;
-}
-DataType OutputInfo::data_type() const {
-  return data_type_;
-}
-void OutputInfo::set_data_type(DataType data_type) {
-  data_type_ = data_type;
-}
-const std::vector<int32_t> &OutputInfo::dims() const {
-  return dims_;
-}
-void OutputInfo::set_dims(const std::vector<int32_t> &dims) {
-  dims_ = dims;
-}
+const std::string &OutputInfo::name() const { return name_; }
+int32_t OutputInfo::node_id() const { return node_id_; }
+int32_t OutputInfo::max_byte_size() const { return max_byte_size_; }
+DataType OutputInfo::data_type() const { return data_type_; }
+void OutputInfo::set_data_type(DataType data_type) { data_type_ = data_type; }
+const std::vector<int32_t> &OutputInfo::dims() const { return dims_; }
+void OutputInfo::set_dims(const std::vector<int32_t> &dims) { dims_ = dims; }
 
 // NetDef
 NetDef::NetDef() : has_bits_(0) {}
 
-const std::string &NetDef::name() const {
-  return name_;
-}
+const std::string &NetDef::name() const { return name_; }
 void NetDef::set_name(const std::string &value) {
   set_has_name();
   name_ = value;
 }
-bool NetDef::has_name() const {
-  return (has_bits_ & 0x00000001u) != 0;
-}
-void NetDef::set_has_name() {
-  has_bits_ |= 0x00000001u;
-}
-const std::string &NetDef::version() const {
-  return version_;
-}
+bool NetDef::has_name() const { return (has_bits_ & 0x00000001u) != 0; }
+void NetDef::set_has_name() { has_bits_ |= 0x00000001u; }
+const std::string &NetDef::version() const { return version_; }
 void NetDef::set_version(const std::string &value) {
   set_has_version();
   version_ = value;
 }
-bool NetDef::has_version() const {
-  return (has_bits_ & 0x00000002u) != 0;
-}
-void NetDef::set_has_version() {
-  has_bits_ |= 0x00000002u;
-}
-const std::vector<OperatorDef> &NetDef::op() const {
-  return op_;
-}
+bool NetDef::has_version() const { return (has_bits_ & 0x00000002u) != 0; }
+void NetDef::set_has_version() { has_bits_ |= 0x00000002u; }
+const std::vector<OperatorDef> &NetDef::op() const { return op_; }
 OperatorDef *NetDef::add_op() {
   op_.emplace_back(OperatorDef());
   return &op_.back();
 }
-std::vector<OperatorDef> &NetDef::mutable_op() {
-  return op_;
-}
-const std::vector<Argument> &NetDef::arg() const {
-  return arg_;
-}
+std::vector<OperatorDef> &NetDef::mutable_op() { return op_; }
+const std::vector<Argument> &NetDef::arg() const { return arg_; }
 Argument *NetDef::add_arg() {
   arg_.emplace_back(Argument());
   return &arg_.back();
 }
-std::vector<Argument> &NetDef::mutable_arg() {
-  return arg_;
-}
-const std::vector<ConstTensor> &NetDef::tensors() const {
-  return tensors_;
-}
-std::vector<ConstTensor> &NetDef::mutable_tensors() {
-  return tensors_;
-}
-const MemoryArena &NetDef::mem_arena() const {
-  return mem_arena_;
-}
+std::vector<Argument> &NetDef::mutable_arg() { return arg_; }
+const std::vector<ConstTensor> &NetDef::tensors() const { return tensors_; }
+std::vector<ConstTensor> &NetDef::mutable_tensors() { return tensors_; }
+const MemoryArena &NetDef::mem_arena() const { return mem_arena_; }
 MemoryArena &NetDef::mutable_mem_arena() {
   set_has_mem_arena();
   return mem_arena_;
 }
-bool NetDef::has_mem_arena() const {
-  return (has_bits_ & 0x00000004u) != 0;
-}
-void NetDef::set_has_mem_arena() {
-  has_bits_ |= 0x00000004u;
-}
-const std::vector<InputInfo> &NetDef::input_info() const {
-  return input_info_;
-}
+bool NetDef::has_mem_arena() const { return (has_bits_ & 0x00000004u) != 0; }
+void NetDef::set_has_mem_arena() { has_bits_ |= 0x00000004u; }
+const std::vector<InputInfo> &NetDef::input_info() const { return input_info_; }
 const std::vector<OutputInfo> &NetDef::output_info() const {
   return output_info_;
 }
-std::vector<OutputInfo> &NetDef::mutable_output_info() {
-  return output_info_;
-}
+std::vector<OutputInfo> &NetDef::mutable_output_info() { return output_info_; }
 
-int NetDef::op_size() const {
-  return op_.size();
-}
+int NetDef::op_size() const { return op_.size(); }
 
 const OperatorDef &NetDef::op(const int idx) const {
   MACE_CHECK(0 <= idx && idx < op_size());
@@ -510,26 +350,27 @@ const OperatorDef &NetDef::op(const int idx) const {
 }
 
 // Mace Engine
-MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
-    op_registry_(new OperatorRegistry()), device_type_(device_type),
-    ws_(new Workspace()), net_(nullptr), hexagon_controller_(nullptr) {
-  ws_->CreateTensor("mace_input_node:0",
-                    GetDeviceAllocator(device_type_),
+MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type)
+    : op_registry_(new OperatorRegistry()),
+      device_type_(device_type),
+      ws_(new Workspace()),
+      net_(nullptr),
+      hexagon_controller_(nullptr) {
+  ws_->CreateTensor("mace_input_node:0", GetDeviceAllocator(device_type_),
                     DT_FLOAT);
-  ws_->CreateTensor("mace_output_node:0",
-                    GetDeviceAllocator(device_type_),
+  ws_->CreateTensor("mace_output_node:0", GetDeviceAllocator(device_type_),
                     DT_FLOAT);
   if (device_type == HEXAGON) {
     hexagon_controller_.reset(new HexagonControlWrapper());
     MACE_CHECK(hexagon_controller_->Config(), "hexagon config error");
     MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
     hexagon_controller_->SetDebugLevel(
-      static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
-    int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>(
-        *net_def, "dsp_mode", 0);
+        static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
+    int dsp_mode =
+        ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
     hexagon_controller_->SetGraphMode(dsp_mode);
     MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
-                "hexagon setup graph error");
+               "hexagon setup graph error");
     if (VLOG_IS_ON(2)) {
       hexagon_controller_->PrintGraph();
     }
@@ -537,8 +378,8 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
     ws_->LoadModelTensor(*net_def, device_type);
 
     // Init model
-    auto net = CreateNet(op_registry_, *net_def, ws_.get(),
-                         device_type, NetMode::INIT);
+    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
+                         NetMode::INIT);
     if (!net->Run()) {
       LOG(FATAL) << "Net init run failed";
     }
@@ -548,18 +389,19 @@ MaceEngine::MaceEngine(const NetDef *net_def, DeviceType device_type) :
 MaceEngine::MaceEngine(const NetDef *net_def,
                        DeviceType device_type,
                        const std::vector<std::string> &input_nodes,
-                       const std::vector<std::string> &output_nodes) :
-    op_registry_(new OperatorRegistry()), device_type_(device_type),
-    ws_(new Workspace()), net_(nullptr), hexagon_controller_(nullptr) {
+                       const std::vector<std::string> &output_nodes)
+    : op_registry_(new OperatorRegistry()),
+      device_type_(device_type),
+      ws_(new Workspace()),
+      net_(nullptr),
+      hexagon_controller_(nullptr) {
   for (auto input_name : input_nodes) {
     ws_->CreateTensor(MakeString("mace_input_node_", input_name, ":0"),
-                      GetDeviceAllocator(device_type_),
-                      DT_FLOAT);
+                      GetDeviceAllocator(device_type_), DT_FLOAT);
   }
   for (auto output_name : output_nodes) {
     ws_->CreateTensor(MakeString("mace_output_node_", output_name, ":0"),
-                      GetDeviceAllocator(device_type_),
-                      DT_FLOAT);
+                      GetDeviceAllocator(device_type_), DT_FLOAT);
   }
   if (device_type == HEXAGON) {
     hexagon_controller_.reset(new HexagonControlWrapper());
@@ -567,8 +409,8 @@ MaceEngine::MaceEngine(const NetDef *net_def,
     MACE_CHECK(hexagon_controller_->Init(), "hexagon init error");
     hexagon_controller_->SetDebugLevel(
         static_cast<int>(mace::logging::LogMessage::MinVLogLevel()));
-    int dsp_mode = ArgumentHelper::GetSingleArgument<NetDef, int>(
-        *net_def, "dsp_mode", 0);
+    int dsp_mode =
+        ArgumentHelper::GetSingleArgument<NetDef, int>(*net_def, "dsp_mode", 0);
     hexagon_controller_->SetGraphMode(dsp_mode);
     MACE_CHECK(hexagon_controller_->SetupGraph(*net_def),
                "hexagon setup graph error");
@@ -579,14 +421,13 @@ MaceEngine::MaceEngine(const NetDef *net_def,
     ws_->LoadModelTensor(*net_def, device_type);
 
     // Init model
-    auto net = CreateNet(op_registry_, *net_def, ws_.get(),
-                         device_type, NetMode::INIT);
+    auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_type,
+                         NetMode::INIT);
     if (!net->Run()) {
       LOG(FATAL) << "Net init run failed";
     }
     net_ = std::move(CreateNet(op_registry_, *net_def, ws_.get(), device_type));
   }
-
 }
 MaceEngine::~MaceEngine() {
   if (device_type_ == HEXAGON) {
@@ -643,10 +484,11 @@ bool MaceEngine::Run(const float *input,
 bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
                      std::map<std::string, float *> &outputs,
                      RunMetadata *run_metadata) {
-
-  MACE_CHECK(device_type_ != HEXAGON, "HEXAGON not supports multiple outputs now");
+  MACE_CHECK(device_type_ != HEXAGON,
+             "HEXAGON not supports multiple outputs now");
   for (auto input : inputs) {
-    Tensor *input_tensor = ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0"));
+    Tensor *input_tensor =
+        ws_->GetTensor(MakeString("mace_input_node_", input.name, ":0"));
     input_tensor->Resize(input.shape);
     {
       Tensor::MappingGuard input_guard(input_tensor);
@@ -658,7 +500,8 @@ bool MaceEngine::Run(const std::vector<MaceInputInfo> &inputs,
     LOG(FATAL) << "Net run failed";
   }
   for (auto output : outputs) {
-    Tensor *output_tensor = ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
+    Tensor *output_tensor =
+        ws_->GetTensor(MakeString("mace_output_node_", output.first + ":0"));
     // save output
     if (output_tensor != nullptr && output.second != nullptr) {
       Tensor::MappingGuard output_guard(output_tensor);
diff --git a/mace/core/net.cc b/mace/core/net.cc
index ce44b951..2439a67f 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -3,9 +3,9 @@
 //
 
 #include "mace/core/net.h"
-#include "mace/utils/utils.h"
-#include "mace/utils/timer.h"
 #include "mace/utils/memory_logging.h"
+#include "mace/utils/timer.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 
@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
                      Workspace *ws,
                      DeviceType type,
                      const NetMode mode)
-    :  NetBase(op_registry, net_def, ws, type),
-      device_type_(type) {
+    : NetBase(op_registry, net_def, ws, type), device_type_(type) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto &operator_def = net_def->op(idx);
@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
   MACE_LATENCY_LOGGER(1, "Running net");
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
-    MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(),
-                        "(", op->debug_def().type(), ")");
+    MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
+                        op->debug_def().type(), ")");
     bool future_wait = (device_type_ == DeviceType::OPENCL &&
                         (run_metadata != nullptr ||
                          std::distance(iter, operators_.end()) == 1));
@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
     Workspace *ws,
     DeviceType type,
     const NetMode mode) {
-  std::unique_ptr<NetBase> net(new SerialNet(op_registry, net_def, ws, type, mode));
+  std::unique_ptr<NetBase> net(
+      new SerialNet(op_registry, net_def, ws, type, mode));
   return net;
 }
 
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 773db3dd..a163c0c8 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -7,10 +7,10 @@
 
 #include "mace/core/arg_helper.h"
 #include "mace/core/future.h"
-#include "mace/public/mace.h"
 #include "mace/core/registry.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 
@@ -147,7 +147,7 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) {
 class OperatorRegistry {
  public:
   typedef Registry<std::string, OperatorBase, const OperatorDef &, Workspace *>
-    RegistryType;
+      RegistryType;
   OperatorRegistry();
   ~OperatorRegistry() = default;
   RegistryType *registry() { return &registry_; };
diff --git a/mace/core/preallocated_pooled_allocator.h b/mace/core/preallocated_pooled_allocator.h
index 75cf4117..ad0c975a 100644
--- a/mace/core/preallocated_pooled_allocator.h
+++ b/mace/core/preallocated_pooled_allocator.h
@@ -36,6 +36,6 @@ class PreallocatedPooledAllocator {
   std::unordered_map<int, std::unique_ptr<BufferBase>> buffers_;
 };
 
-} // namespace mace
+}  // namespace mace
 
-#endif // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
+#endif  // MACE_CORE_PREALLOCATED_POOLED_ALLOCATOR_H_
diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
index 8735f529..2828ffa0 100644
--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.cc
@@ -2,19 +2,19 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <vector>
-#include <thread>
 #include <sys/time.h>
+#include <thread>
+#include <vector>
 
 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
 #include "mace/core/runtime/hexagon/hexagon_nn_ops.h"
 
 namespace {
-  inline int64_t NowMicros() {
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-    return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-  }
+inline int64_t NowMicros() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
 }
 
 namespace mace {
@@ -63,9 +63,9 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
   // const node
   std::thread const_thread([&]() {
     std::vector<hexagon_nn_const_node> const_node_list;
-    for (const ConstTensor &const_tensor: net_def.tensors()) {
+    for (const ConstTensor &const_tensor : net_def.tensors()) {
       std::vector<int> tensor_shape(const_tensor.dims().begin(),
-                               const_tensor.dims().end());
+                                    const_tensor.dims().end());
       while (tensor_shape.size() < 4) {
         tensor_shape.insert(tensor_shape.begin(), 1);
       }
@@ -77,32 +77,32 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
       const_node.tensor.width = tensor_shape[2];
       const_node.tensor.depth = tensor_shape[3];
 
-      if (const_tensor.data_type() == DataType::DT_INT32
-        && const_tensor.data_size() == 0) {
+      if (const_tensor.data_type() == DataType::DT_INT32 &&
+          const_tensor.data_size() == 0) {
         const_node.tensor.data = NULL;
         const_node.tensor.dataLen = 0;
       } else {
         const_node.tensor.data =
-          const_cast<unsigned char *>(const_tensor.data());
-        const_node.tensor.dataLen =
-          const_tensor.data_size() * GetEnumTypeSize(const_tensor.data_type());
+            const_cast<unsigned char *>(const_tensor.data());
+        const_node.tensor.dataLen = const_tensor.data_size() *
+                                    GetEnumTypeSize(const_tensor.data_type());
       }
       const_node_list.push_back(const_node);
       // 255 is magic number: why fastrpc limits sequence length to that?
       if (const_node_list.size() >= 250) {
-        MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_,
-                                                     const_node_list.data(),
-                                                     const_node_list.size())
-                     == 0, "append const node error");
+        MACE_CHECK(
+            hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
+                                              const_node_list.size()) == 0,
+            "append const node error");
         const_node_list.clear();
       }
     }
 
     if (!const_node_list.empty()) {
-      MACE_CHECK(hexagon_nn_append_const_node_list(nn_id_,
-                                                   const_node_list.data(),
-                                                   const_node_list.size()) == 0,
-                 "append const node error");
+      MACE_CHECK(
+          hexagon_nn_append_const_node_list(nn_id_, const_node_list.data(),
+                                            const_node_list.size()) == 0,
+          "append const node error");
     }
     const_node_list.clear();
   });
@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
     std::vector<hexagon_nn_input> inputs;
     std::vector<hexagon_nn_output> outputs;
 
-    for (const OperatorDef &op: net_def.op()) {
+    for (const OperatorDef &op : net_def.op()) {
       int op_id = op_map.GetOpId(op.type());
       inputs.resize(op.node_input().size());
       for (size_t i = 0; i < op.node_input().size(); ++i) {
@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
       cached_inputs.push_back(inputs);
       cached_outputs.push_back(outputs);
 
-      hexagon_nn_padding_type
-        padding_type = static_cast<hexagon_nn_padding_type>(
-        op.padding());
+      hexagon_nn_padding_type padding_type =
+          static_cast<hexagon_nn_padding_type>(op.padding());
 
       hexagon_nn_op_node op_node;
       op_node.node_id = node_id(op.node_id());
@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
 
       op_node_list.push_back(op_node);
       if (op_node_list.size() >= 125) {
-        MACE_CHECK(hexagon_nn_append_node_list(nn_id_,
-                                               op_node_list.data(),
+        MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
                                                op_node_list.size()) == 0,
                    "append node error");
         op_node_list.clear();
@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
     }
 
     if (!op_node_list.empty()) {
-      MACE_CHECK(hexagon_nn_append_node_list(nn_id_,
-                                             op_node_list.data(),
+      MACE_CHECK(hexagon_nn_append_node_list(nn_id_, op_node_list.data(),
                                              op_node_list.size()) == 0,
                  "append node error");
     }
@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
 
   // input info
   num_inputs_ = 0;
-  for (const InputInfo &input_info: net_def.input_info()) {
+  for (const InputInfo &input_info : net_def.input_info()) {
     std::vector<index_t> input_shape;
-    input_shape.insert(input_shape.begin(),
-                       input_info.dims().begin(), input_info.dims().end());
+    input_shape.insert(input_shape.begin(), input_info.dims().begin(),
+                       input_info.dims().end());
     while (input_shape.size() < 4) {
       input_shape.insert(input_shape.begin(), 1);
     }
@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
 
   // output info
   num_outputs_ = 0;
-  for (const OutputInfo &output_info: net_def.output_info()) {
+  for (const OutputInfo &output_info : net_def.output_info()) {
     std::vector<index_t> output_shape;
-    output_shape.insert(output_shape.begin(),
-                        output_info.dims().begin(), output_info.dims().end());
+    output_shape.insert(output_shape.begin(), output_info.dims().begin(),
+                        output_info.dims().end());
     while (output_shape.size() < 4) {
       output_shape.insert(output_shape.begin(), 1);
     }
@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
   return hexagon_nn_teardown(nn_id_) == 0;
 }
 
-#define PRINT_BUFSIZE (2*1024*1024)
+#define PRINT_BUFSIZE (2 * 1024 * 1024)
 
 void HexagonControlWrapper::PrintLog() {
   char *buf;
   if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
-  MACE_CHECK(hexagon_nn_getlog(nn_id_,
-                               reinterpret_cast<unsigned char *>(buf),
-                               PRINT_BUFSIZE) == 0, "print log error");
+  MACE_CHECK(hexagon_nn_getlog(nn_id_, reinterpret_cast<unsigned char *>(buf),
+                               PRINT_BUFSIZE) == 0,
+             "print log error");
   LOG(INFO) << std::string(buf);
-  delete[]buf;
+  delete[] buf;
 }
 
 void HexagonControlWrapper::PrintGraph() {
   LOG(INFO) << "Print Graph";
   char *buf;
   if ((buf = new char[PRINT_BUFSIZE]) == NULL) return;
-  MACE_CHECK(hexagon_nn_snpprint(nn_id_,
-                                 reinterpret_cast<unsigned char *>(buf),
-                                 PRINT_BUFSIZE) == 0, "print graph error");
+  MACE_CHECK(hexagon_nn_snpprint(nn_id_, reinterpret_cast<unsigned char *>(buf),
+                                 PRINT_BUFSIZE) == 0,
+             "print graph error");
   LOG(INFO) << std::string(buf);
-  delete[]buf;
+  delete[] buf;
 }
 
 void HexagonControlWrapper::SetDebugLevel(int level) {
@@ -256,9 +253,9 @@ void HexagonControlWrapper::GetPerfInfo() {
   LOG(INFO) << "Get perf info";
   std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
   unsigned int n_items = 0;
-  MACE_CHECK(
-    hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE, &n_items) == 0,
-    "get perf info error");
+  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
+                                     &n_items) == 0,
+             "get perf info error");
 
   std::unordered_map<uint32_t, float> node_id_counters;
   std::unordered_map<std::string, std::pair<int, float>> node_type_counters;
@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
     unsigned int node_id = perf_info[i].node_id;
     unsigned int node_type_id = perf_info[i].node_type;
     node_id_counters[node_id] =
-      ((static_cast<uint64_t>(perf_info[i].counter_hi) << 32)
-        + perf_info[i].counter_lo) * 1.0f / perf_info[i].executions;
+        ((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) +
+         perf_info[i].counter_lo) *
+        1.0f / perf_info[i].executions;
 
     char node_type_buf[MAX_NODE];
     hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
     total_duration += node_id_counters[node_id];
   }
 
-  for (auto &node_type_counter: node_type_counters) {
+  for (auto &node_type_counter : node_type_counters) {
     LOG(INFO) << "node type: " << node_type_counter.first
               << ", time: " << node_type_counter.second.first
               << ", duration: " << node_type_counter.second.second;
@@ -312,33 +310,25 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
   output_tensor->Resize(output_shapes_[0]);
   std::vector<uint32_t> output_shape(4);
   uint32_t output_bytes;
-  int res = hexagon_nn_execute(nn_id_,
-                               input_tensor.shape()[0],
-                               input_tensor.shape()[1],
-                               input_tensor.shape()[2],
-                               input_tensor.shape()[3],
-                               reinterpret_cast<const unsigned char *>(
-                                 input_tensor.raw_data()),
-                               input_tensor.raw_size(),
-                               &output_shape[0],
-                               &output_shape[1],
-                               &output_shape[2],
-                               &output_shape[3],
-                               reinterpret_cast<unsigned char *>(
-                                 output_tensor->raw_mutable_data()),
-                               output_tensor->raw_size(),
-                               &output_bytes);
+  int res = hexagon_nn_execute(
+      nn_id_, input_tensor.shape()[0], input_tensor.shape()[1],
+      input_tensor.shape()[2], input_tensor.shape()[3],
+      reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
+      input_tensor.raw_size(), &output_shape[0], &output_shape[1],
+      &output_shape[2], &output_shape[3],
+      reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
+      output_tensor->raw_size(), &output_bytes);
   MACE_CHECK(res == 0, "execute error");
 
-  MACE_ASSERT(output_shape == output_shapes_[0],
-              "wrong output shape inferred");
+  MACE_ASSERT(output_shape == output_shapes_[0], "wrong output shape inferred");
   MACE_ASSERT(output_bytes == output_tensor->raw_size(),
               "wrong output bytes inferred.");
   return res == 0;
 };
 
-bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
-                                            std::vector<Tensor> *output_tensors) {
+bool HexagonControlWrapper::ExecuteGraphNew(
+    const std::vector<Tensor> &input_tensors,
+    std::vector<Tensor> *output_tensors) {
   LOG(INFO) << "Execute graph new: " << nn_id_;
   int num_inputs = input_tensors.size();
   int num_outputs = output_tensors->size();
@@ -355,7 +345,7 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
     inputs[i].width = input_shape[2];
     inputs[i].depth = input_shape[3];
     inputs[i].data = const_cast<unsigned char *>(
-      reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
+        reinterpret_cast<const unsigned char *>(input_tensors[i].raw_data()));
     inputs[i].dataLen = input_tensors[i].raw_size();
     inputs[i].data_valid_len = input_tensors[i].raw_size();
     inputs[i].unused = 0;
@@ -365,16 +355,16 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
     (*output_tensors)[i].SetDtype(output_data_types_[i]);
     (*output_tensors)[i].Resize(output_shapes_[i]);
     outputs[i].data = reinterpret_cast<unsigned char *>(
-      (*output_tensors)[i].raw_mutable_data());
+        (*output_tensors)[i].raw_mutable_data());
     outputs[i].dataLen = (*output_tensors)[i].raw_size();
   }
 
-  int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
-                                   outputs, num_outputs);
+  int res =
+      hexagon_nn_execute_new(nn_id_, inputs, num_inputs, outputs, num_outputs);
 
   for (int i = 0; i < num_outputs; ++i) {
     std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
-                                  outputs[i].width, outputs[i].depth};
+                                       outputs[i].width, outputs[i].depth};
     MACE_ASSERT(output_shape == output_shapes_[i],
                 "wrong output shape inferred");
     MACE_ASSERT(outputs[i].data_valid_len == (*output_tensors)[i].raw_size(),
@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
   float *min_in_data = input_tensors[1].mutable_data<float>();
   input_tensors[2].Resize({1, 1, 1, 1});
   float *max_in_data = input_tensors[2].mutable_data<float>();
-  quantizer_.Quantize(input_tensor,
-                      &input_tensors[0],
-                      min_in_data,
+  quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data,
                       max_in_data);
   if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
     return false;
@@ -409,11 +397,9 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
 
   const float *min_out_data = output_tensors[1].data<float>();
   const float *max_out_data = output_tensors[2].data<float>();
-  quantizer_.DeQuantize(output_tensors[0],
-                        *min_out_data,
-                        *max_out_data,
+  quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data,
                         output_tensor);
   return true;
 }
 
-} // namespace mace
+}  // namespace mace
diff --git a/mace/core/runtime/hexagon/hexagon_control_wrapper.h b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
index 09a1c778..8cb3b359 100644
--- a/mace/core/runtime/hexagon/hexagon_control_wrapper.h
+++ b/mace/core/runtime/hexagon/hexagon_control_wrapper.h
@@ -16,16 +16,17 @@ namespace mace {
 
 class HexagonControlWrapper {
  public:
-  HexagonControlWrapper() {};
+  HexagonControlWrapper(){};
   int GetVersion();
   bool Config();
   bool Init();
   bool Finalize();
-  bool SetupGraph(const NetDef& net_def);
+  bool SetupGraph(const NetDef &net_def);
   bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
-  bool ExecuteGraphNew(const std::vector<Tensor>& input_tensors,
+  bool ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
                        std::vector<Tensor> *output_tensors);
-  bool ExecuteGraphPreQuantize(const Tensor &input_tensor, Tensor *output_tensor);
+  bool ExecuteGraphPreQuantize(const Tensor &input_tensor,
+                               Tensor *output_tensor);
 
   bool TeardownGraph();
   void PrintLog();
@@ -38,9 +39,7 @@ class HexagonControlWrapper {
  private:
   static constexpr int NODE_ID_OFFSET = 10000;
 
-  inline uint32_t node_id(uint32_t nodeid) {
-    return NODE_ID_OFFSET + nodeid;
-  }
+  inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
 
   int nn_id_;
   Quantizer quantizer_;
@@ -52,9 +51,8 @@ class HexagonControlWrapper {
   uint32_t num_inputs_;
   uint32_t num_outputs_;
 
- DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
+  DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
 };
-
 }
 
-#endif // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
+#endif  // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
diff --git a/mace/core/runtime/hexagon/hexagon_controller_dummy.cc b/mace/core/runtime/hexagon/hexagon_controller_dummy.cc
index fdc62ede..c1fdcc42 100644
--- a/mace/core/runtime/hexagon/hexagon_controller_dummy.cc
+++ b/mace/core/runtime/hexagon/hexagon_controller_dummy.cc
@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
   return 0;
 }
 
-int hexagon_controller_DeInitHexagon() {
+int hexagon_controller_DeInitHexagon() { return 0; }
+
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
+    hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
+    hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
+                                                            unsigned char *buf,
+                                                            int bufLen)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
+                                                          unsigned char *buf,
+                                                          int bufLen)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
+    hexagon_nn_nn_id id,
+    unsigned int node_id,
+    unsigned int operation,
+    hexagon_nn_padding_type padding,
+    const hexagon_nn_input *inputs,
+    int inputsLen,
+    const hexagon_nn_output *outputs,
+    int outputsLen) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
+    hexagon_nn_nn_id id,
+    const hexagon_nn_op_node *ops,
+    int opsLen) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
+    hexagon_nn_nn_id id,
+    unsigned int node_id,
+    unsigned int batches,
+    unsigned int height,
+    unsigned int width,
+    unsigned int depth,
+    const unsigned char *data,
+    int dataLen) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
+    hexagon_nn_nn_id id,
+    const hexagon_nn_const_node *consts,
+    int constsLen) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
+    hexagon_nn_nn_id id,
+    unsigned int batches_in,
+    unsigned int height_in,
+    unsigned int width_in,
+    unsigned int depth_in,
+    const unsigned char *data_in,
+    int data_inLen,
+    unsigned int *batches_out,
+    unsigned int *height_out,
+    unsigned int *width_out,
+    unsigned int *depth_out,
+    unsigned char *data_out,
+    int data_outLen,
+    unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
+    unsigned int level) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
+    hexagon_nn_nn_id id,
+    hexagon_nn_perfinfo *info_out,
+    int info_outLen,
+    unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
+    hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
+    hexagon_nn_nn_id id,
+    unsigned int *cycles_lo,
+    unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
+    const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
+    unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
+    __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
+    int *ver) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
+    const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE {
+  return 0;
+}
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
+    hexagon_nn_nn_id id,
+    const hexagon_nn_tensordef *inputs,
+    int inputsLen,
+    hexagon_nn_tensordef *outputs,
+    int outputsLen) __QAIC_HEADER_ATTRIBUTE {
   return 0;
 }
-
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE { return 0; }
diff --git a/mace/core/runtime/hexagon/hexagon_nn.h b/mace/core/runtime/hexagon/hexagon_nn.h
index 3bfd79c0..0baafd8c 100644
--- a/mace/core/runtime/hexagon/hexagon_nn.h
+++ b/mace/core/runtime/hexagon/hexagon_nn.h
@@ -2,27 +2,27 @@
 #define _HEXAGON_NN_H
 #ifndef __QAIC_HEADER
 #define __QAIC_HEADER(ff) ff
-#endif //__QAIC_HEADER
+#endif  // __QAIC_HEADER
 
 #ifndef __QAIC_HEADER_EXPORT
 #define __QAIC_HEADER_EXPORT
-#endif // __QAIC_HEADER_EXPORT
+#endif  // __QAIC_HEADER_EXPORT
 
 #ifndef __QAIC_HEADER_ATTRIBUTE
 #define __QAIC_HEADER_ATTRIBUTE
-#endif // __QAIC_HEADER_ATTRIBUTE
+#endif  // __QAIC_HEADER_ATTRIBUTE
 
 #ifndef __QAIC_IMPL
 #define __QAIC_IMPL(ff) ff
-#endif //__QAIC_IMPL
+#endif  // __QAIC_IMPL
 
 #ifndef __QAIC_IMPL_EXPORT
 #define __QAIC_IMPL_EXPORT
-#endif // __QAIC_IMPL_EXPORT
+#endif  // __QAIC_IMPL_EXPORT
 
 #ifndef __QAIC_IMPL_ATTRIBUTE
 #define __QAIC_IMPL_ATTRIBUTE
-#endif // __QAIC_IMPL_ATTRIBUTE
+#endif  // __QAIC_IMPL_ATTRIBUTE
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -30,92 +30,160 @@ extern "C" {
 #define __QAIC_STRING1_OBJECT_DEFINED__
 #define __STRING1_OBJECT__
 typedef struct _cstring1_s {
-   char* data;
-   int dataLen;
+  char *data;
+  int dataLen;
 } _cstring1_t;
 
 #endif /* __QAIC_STRING1_OBJECT_DEFINED__ */
 typedef struct hexagon_nn_input hexagon_nn_input;
 struct hexagon_nn_input {
-   unsigned int src_id;
-   unsigned int output_idx;
+  unsigned int src_id;
+  unsigned int output_idx;
 };
 typedef struct hexagon_nn_output hexagon_nn_output;
 struct hexagon_nn_output {
-   unsigned int max_size;
-   unsigned int unused;
+  unsigned int max_size;
+  unsigned int unused;
 };
 typedef struct hexagon_nn_perfinfo hexagon_nn_perfinfo;
 struct hexagon_nn_perfinfo {
-   unsigned int node_id;
-   unsigned int node_type;
-   unsigned int executions;
-   unsigned int unused;
-   unsigned int counter_lo;
-   unsigned int counter_hi;
+  unsigned int node_id;
+  unsigned int node_type;
+  unsigned int executions;
+  unsigned int unused;
+  unsigned int counter_lo;
+  unsigned int counter_hi;
 };
 typedef int hexagon_nn_nn_id;
 enum hexagon_nn_padding_type {
-   NN_PAD_NA,
-   NN_PAD_SAME,
-   NN_PAD_VALID,
-   NN_PAD_MIRROR_REFLECT,
-   NN_PAD_MIRROR_SYMMETRIC,
-   NN_PAD_SAME_CAFFE,
-   _32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
+  NN_PAD_NA,
+  NN_PAD_SAME,
+  NN_PAD_VALID,
+  NN_PAD_MIRROR_REFLECT,
+  NN_PAD_MIRROR_SYMMETRIC,
+  NN_PAD_SAME_CAFFE,
+  _32BIT_PLACEHOLDER_hexagon_nn_padding_type = 0x7fffffff
 };
 typedef enum hexagon_nn_padding_type hexagon_nn_padding_type;
 typedef struct hexagon_nn_tensordef hexagon_nn_tensordef;
 struct hexagon_nn_tensordef {
-   unsigned int batches;
-   unsigned int height;
-   unsigned int width;
-   unsigned int depth;
-   unsigned char* data;
-   int dataLen;
-   unsigned int data_valid_len;
-   unsigned int unused;
+  unsigned int batches;
+  unsigned int height;
+  unsigned int width;
+  unsigned int depth;
+  unsigned char *data;
+  int dataLen;
+  unsigned int data_valid_len;
+  unsigned int unused;
 };
 typedef struct hexagon_nn_op_node hexagon_nn_op_node;
 struct hexagon_nn_op_node {
-   unsigned int node_id;
-   unsigned int operation;
-   hexagon_nn_padding_type padding;
-   hexagon_nn_input* inputs;
-   int inputsLen;
-   hexagon_nn_output* outputs;
-   int outputsLen;
+  unsigned int node_id;
+  unsigned int operation;
+  hexagon_nn_padding_type padding;
+  hexagon_nn_input *inputs;
+  int inputsLen;
+  hexagon_nn_output *outputs;
+  int outputsLen;
 };
 typedef struct hexagon_nn_const_node hexagon_nn_const_node;
 struct hexagon_nn_const_node {
-   unsigned int node_id;
-   hexagon_nn_tensordef tensor;
+  unsigned int node_id;
+  hexagon_nn_tensordef tensor;
 };
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id, unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation, hexagon_nn_padding_type padding, const hexagon_nn_input* inputs, int inputsLen, const hexagon_nn_output* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(hexagon_nn_nn_id id, const hexagon_nn_op_node* ops, int opsLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches, unsigned int height, unsigned int width, unsigned int depth, const unsigned char* data, int dataLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(hexagon_nn_nn_id id, const hexagon_nn_const_node* consts, int constsLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in, unsigned int width_in, unsigned int depth_in, const unsigned char* data_in, int data_inLen, unsigned int* batches_out, unsigned int* height_out, unsigned int* width_out, unsigned int* depth_out, unsigned char* data_out, int data_outLen, unsigned int* data_len_out) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(unsigned int level) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(hexagon_nn_nn_id id, hexagon_nn_perfinfo* info_out, int info_outLen, unsigned int* n_items) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(hexagon_nn_nn_id id, unsigned int* cycles_lo, unsigned int* cycles_hi) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int* ver) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(const char* name, unsigned int* node_id) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(unsigned int node_id, char* name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(int* ver) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(const unsigned char* buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
-__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(hexagon_nn_nn_id id, const hexagon_nn_tensordef* inputs, int inputsLen, hexagon_nn_tensordef* outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
+    hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
+    hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(hexagon_nn_nn_id id,
+                                                            unsigned char *buf,
+                                                            int bufLen)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(hexagon_nn_nn_id id,
+                                                          unsigned char *buf,
+                                                          int bufLen)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
+    hexagon_nn_nn_id id,
+    unsigned int node_id,
+    unsigned int operation,
+    hexagon_nn_padding_type padding,
+    const hexagon_nn_input *inputs,
+    int inputsLen,
+    const hexagon_nn_output *outputs,
+    int outputsLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
+    hexagon_nn_nn_id id,
+    const hexagon_nn_op_node *ops,
+    int opsLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
+    hexagon_nn_nn_id id,
+    unsigned int node_id,
+    unsigned int batches,
+    unsigned int height,
+    unsigned int width,
+    unsigned int depth,
+    const unsigned char *data,
+    int dataLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
+    hexagon_nn_nn_id id,
+    const hexagon_nn_const_node *consts,
+    int constsLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
+    hexagon_nn_nn_id id,
+    unsigned int batches_in,
+    unsigned int height_in,
+    unsigned int width_in,
+    unsigned int depth_in,
+    const unsigned char *data_in,
+    int data_inLen,
+    unsigned int *batches_out,
+    unsigned int *height_out,
+    unsigned int *width_out,
+    unsigned int *depth_out,
+    unsigned char *data_out,
+    int data_outLen,
+    unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(hexagon_nn_nn_id id)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
+    unsigned int level) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
+    hexagon_nn_nn_id id,
+    hexagon_nn_perfinfo *info_out,
+    int info_outLen,
+    unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
+    hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
+    hexagon_nn_nn_id id,
+    unsigned int *cycles_lo,
+    unsigned int *cycles_hi) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
+    const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
+    unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
+    __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
+    int *ver) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
+    const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
+__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
+    hexagon_nn_nn_id id,
+    const hexagon_nn_tensordef *inputs,
+    int inputsLen,
+    hexagon_nn_tensordef *outputs,
+    int outputsLen) __QAIC_HEADER_ATTRIBUTE;
 #ifdef __cplusplus
 }
 #endif
-#endif //_HEXAGON_NN_H
+#endif  // _HEXAGON_NN_H
diff --git a/mace/core/runtime/hexagon/hexagon_nn_ops.h b/mace/core/runtime/hexagon/hexagon_nn_ops.h
index dfb3b386..8704ce80 100644
--- a/mace/core/runtime/hexagon/hexagon_nn_ops.h
+++ b/mace/core/runtime/hexagon/hexagon_nn_ops.h
@@ -5,8 +5,8 @@
 #ifndef LIBMACE_HEXAGON_NN_OPS_H
 #define LIBMACE_HEXAGON_NN_OPS_H
 
-#include "mace/utils/logging.h"
 #include <unordered_map>
+#include "mace/utils/logging.h"
 
 namespace mace {
 
@@ -24,8 +24,7 @@ typedef enum op_type_enum {
 class OpMap {
  public:
   void Init() {
-#define DEF_OP(NAME) \
-    op_map_[#NAME] = OP_##NAME;
+#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
 
 #include "mace/core/runtime/hexagon/ops.h"
 
@@ -40,9 +39,10 @@ class OpMap {
       return OP_INVALID;
     }
   }
+
  private:
   std::unordered_map<std::string, int> op_map_;
 };
-} // namespace mace
+}  // namespace mace
 
-#endif // LIBMACE_HEXAGON_NN_OPS_H
+#endif  // LIBMACE_HEXAGON_NN_OPS_H
diff --git a/mace/core/runtime/hexagon/ops.h b/mace/core/runtime/hexagon/ops.h
index 79b503cd..55b40413 100644
--- a/mace/core/runtime/hexagon/ops.h
+++ b/mace/core/runtime/hexagon/ops.h
@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
 #undef __SELF_DEF_OP_WREF
 #undef DEF_OP_WREF
 #endif
-
diff --git a/mace/core/runtime/hexagon/quantize.cc b/mace/core/runtime/hexagon/quantize.cc
index 5e3aad62..c4548bcb 100644
--- a/mace/core/runtime/hexagon/quantize.cc
+++ b/mace/core/runtime/hexagon/quantize.cc
@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
                          float *max_out) {
   float stepsize;
   float recip_stepsize;
-  QuantizeAdjustRange(min_in, max_in,
-                      min_out, max_out,
-                      &stepsize, &recip_stepsize);
+  QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
+                      &recip_stepsize);
 
   const float *in = in_tensor.data<float>();
   uint8_t *out = out_tensor->mutable_data<uint8_t>();
 
   for (int i = 0; i < in_tensor.size(); i++) {
     const float inval = in[i];
-    float ival = static_cast<uint8_t>((inval - *min_out) * recip_stepsize + 0.5f);
+    // Stay in float until after the clamps so they see out-of-range values.
+    float ival = (inval - *min_out) * recip_stepsize + 0.5f;
     if (ival < 0) ival = 0;
     if (ival > 255) ival = 255;
     out[i] = static_cast<uint8_t>(ival);
@@ -93,4 +93,4 @@ void Quantizer::DeQuantize(const Tensor &in_tensor,
   }
 }
 
-} // namespace mace
\ No newline at end of file
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/core/runtime/hexagon/quantize.h b/mace/core/runtime/hexagon/quantize.h
index 1ec2f41f..216e0c6b 100644
--- a/mace/core/runtime/hexagon/quantize.h
+++ b/mace/core/runtime/hexagon/quantize.h
@@ -16,13 +16,17 @@ class Quantizer {
 
   void Quantize(const Tensor &in_tensor,
                 Tensor *out_tensor,
-                float *min_out, float *max_out);
+                float *min_out,
+                float *max_out);
   void Quantize(const Tensor &in_tensor,
-                const float min_in, const float max_in,
+                const float min_in,
+                const float max_in,
                 Tensor *out_tensor,
-                float *min_out, float *max_out);
+                float *min_out,
+                float *max_out);
   void DeQuantize(const Tensor &in_tensor,
-                  const float min_in, const float max_in,
+                  const float min_in,
+                  const float max_in,
                   Tensor *out_tensor);
 
  private:
@@ -33,9 +37,9 @@ class Quantizer {
                            float *stepsize,
                            float *recip_stepsize);
 
- DISABLE_COPY_AND_ASSIGN(Quantizer);
+  DISABLE_COPY_AND_ASSIGN(Quantizer);
 };
 
-} // mace
+}  // namespace mace
 
-#endif // MACE_DSP_UTIL_QUANTIZE_H_
+#endif  // MACE_DSP_UTIL_QUANTIZE_H_
diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index abc88bdd..57aa40c2 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_allocator.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 
 namespace mace {
@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
       return 0;
   }
 }
-
 }
 
 OpenCLAllocator::OpenCLAllocator() {}
@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
 void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                                 const DataType dt) const {
   MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
-  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", " << image_shape[1];
+  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
+          << image_shape[1];
 
   cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));
 
   cl_int error;
   cl::Image2D *cl_image =
       new cl::Image2D(OpenCLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
-                      img_format,
-                      image_shape[0], image_shape[1],
-                      0, nullptr, &error);
+                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
+                      image_shape[0], image_shape[1], 0, nullptr, &error);
   MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
                                   << image_shape[0] << ", " << image_shape[1]
                                   << "]";
@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
   // TODO(heliangliang) Non-blocking call
   cl_int error;
   void *mapped_ptr =
-      queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, offset,
-                             nbytes, nullptr, nullptr, &error);
+      queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                             offset, nbytes, nullptr, nullptr, &error);
   MACE_CHECK(error == CL_SUCCESS);
   return mapped_ptr;
 }
@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
 
   mapped_image_pitch->resize(2);
   cl_int error;
-  void *mapped_ptr =
-      OpenCLRuntime::Global()->command_queue().enqueueMapImage(*cl_image,
-                                                            CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
-                                                            origin, region,
-                                                            mapped_image_pitch->data(),
-                                                            mapped_image_pitch->data() + 1,
-                                                            nullptr, nullptr, &error);
+  void *mapped_ptr = OpenCLRuntime::Global()->command_queue().enqueueMapImage(
+      *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
+      mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
+      nullptr, &error);
   MACE_CHECK(error == CL_SUCCESS) << error;
 
   return mapped_ptr;
diff --git a/mace/core/runtime/opencl/opencl_development.cc b/mace/core/runtime/opencl/opencl_development.cc
index 266eceba..71130cf4 100644
--- a/mace/core/runtime/opencl/opencl_development.cc
+++ b/mace/core/runtime/opencl/opencl_development.cc
@@ -5,8 +5,8 @@
 #include <vector>
 
 #include "mace/core/runtime/opencl/cl2_header.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/logging.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 
@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
                               cl::Device &device,
                               cl::Program *program,
                               bool *is_binary) {
-  extern const std::map<std::string, std::vector<unsigned char>> kEncryptedProgramMap;
+  extern const std::map<std::string, std::vector<unsigned char>>
+      kEncryptedProgramMap;
   *is_binary = false;
   auto it_source = kEncryptedProgramMap.find(program_name);
   if (it_source == kEncryptedProgramMap.end()) {
diff --git a/mace/core/runtime/opencl/opencl_production.cc b/mace/core/runtime/opencl/opencl_production.cc
index 265fcbef..78aa5bcf 100644
--- a/mace/core/runtime/opencl/opencl_production.cc
+++ b/mace/core/runtime/opencl/opencl_production.cc
@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
                               cl::Device &device,
                               cl::Program *program,
                               bool *is_binary) {
-  extern const std::map<std::string, std::vector<unsigned char>> kCompiledProgramMap;
+  extern const std::map<std::string, std::vector<unsigned char>>
+      kCompiledProgramMap;
   *is_binary = true;
   auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
   if (it_binary == kCompiledProgramMap.end()) {
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 36b8a837..4a18f630 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
   return (stop_nanos_ - start_nanos_) / 1000.0;
 }
 
-double OpenCLProfilingTimer::AccumulatedMicros() {
-  return accumulated_micros_;
-}
+double OpenCLProfilingTimer::AccumulatedMicros() { return accumulated_micros_; }
 
-void OpenCLProfilingTimer::AccumulateTiming(){
+void OpenCLProfilingTimer::AccumulateTiming() {
   StopTiming();
   accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
 }
@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
   cl::CommandQueue command_queue(context, gpu_device, properties);
 
   const char *kernel_path = getenv("MACE_KERNEL_PATH");
-  this->kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
+  this->kernel_path_ =
+      std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
 
   this->device_ = new cl::Device(gpu_device);
   this->context_ = new cl::Context(context);
@@ -163,18 +162,14 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
   MACE_CHECK_NOTNULL(program);
 
   std::string binary_file_name_prefix =
-    GenerateCLBinaryFilenamePrefix(built_program_key);
+      GenerateCLBinaryFilenamePrefix(built_program_key);
   std::vector<unsigned char> program_vec;
   bool is_opencl_binary;
-  const bool found = GetSourceOrBinaryProgram(program_name,
-                                              binary_file_name_prefix,
-                                              context(),
-                                              device(),
-                                              program,
-                                              &is_opencl_binary);
+  const bool found =
+      GetSourceOrBinaryProgram(program_name, binary_file_name_prefix, context(),
+                               device(), program, &is_opencl_binary);
   MACE_CHECK(found, "Program not found for ",
-                    is_opencl_binary ? "binary: " : "source: ",
-                    built_program_key);
+             is_opencl_binary ? "binary: " : "source: ", built_program_key);
 
   // Build program
   std::string build_options_str =
@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
     }
     LOG(FATAL) << "Build program from "
                << (is_opencl_binary ? "binary: " : "source: ")
-               << built_program_key
-               << " failed: " << ret;
+               << built_program_key << " failed: " << ret;
   }
 
   if (!is_opencl_binary) {
     // Write binary if necessary
-    std::string binary_filename = kernel_path_ + binary_file_name_prefix + ".bin";
+    std::string binary_filename =
+        kernel_path_ + binary_file_name_prefix + ".bin";
     size_t device_list_size = 1;
     std::unique_ptr<size_t[]> program_binary_sizes(
         new size_t[device_list_size]);
@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
   if (built_program_it != built_program_map_.end()) {
     program = built_program_it->second;
   } else {
-    this->BuildProgram(program_name, built_program_key,
-                       build_options_str, &program);
+    this->BuildProgram(program_name, built_program_key, build_options_str,
+                       &program);
     built_program_map_.emplace(built_program_key, program);
   }
   return cl::Kernel(program, kernel_name.c_str());
@@ -250,9 +245,9 @@ cl::Kernel OpenCLRuntime::BuildKernel(
 void OpenCLRuntime::GetCallStats(const cl::Event &event, CallStats *stats) {
   if (stats != nullptr) {
     stats->start_micros =
-      event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
+        event.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000;
     stats->end_micros =
-      event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
+        event.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000;
   }
 }
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index ff596459..1f5ab2a1 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -19,7 +19,8 @@ namespace mace {
 
 class OpenCLProfilingTimer : public Timer {
  public:
-  explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {};
+  explicit OpenCLProfilingTimer(const cl::Event *event)
+      : event_(event), accumulated_micros_(0){};
   void StartTiming() override;
   void StopTiming() override;
   void AccumulateTiming() override;
@@ -48,6 +49,7 @@ class OpenCLRuntime {
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
+
  private:
   OpenCLRuntime();
   ~OpenCLRuntime();
diff --git a/mace/core/runtime/opencl/opencl_wrapper.h b/mace/core/runtime/opencl/opencl_wrapper.h
index c0e88186..fdf90b02 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.h
+++ b/mace/core/runtime/opencl/opencl_wrapper.h
@@ -7,10 +7,10 @@
 
 namespace mace {
 
-  // These functions are not thread-safe.
-  void LoadOpenCLLibrary();
-  void UnloadOpenCLLibrary();
- 
+// These functions are not thread-safe.
+void LoadOpenCLLibrary();
+void UnloadOpenCLLibrary();
+
 }  // namespace mace
 
 #endif  // MACE_CORE_RUNTIME_OPENCL_OPENCL_WRAPPER_H_
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 7adc6a58..9017dafa 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -65,23 +65,20 @@ inline std::ostream &operator<<(std::ostream &os, unsigned char c) {
 class Tensor {
  public:
   Tensor(Allocator *alloc, DataType type)
-    : allocator_(alloc),
-      dtype_(type),
-      buffer_(nullptr),
-      is_buffer_owner_(true),
-      name_("") {};
+      : allocator_(alloc),
+        dtype_(type),
+        buffer_(nullptr),
+        is_buffer_owner_(true),
+        name_(""){};
 
   Tensor(BufferBase *buffer, DataType dtype)
-    : dtype_(dtype),
-      buffer_(buffer),
-      is_buffer_owner_(false),
-      name_("") {}
+      : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), name_("") {}
 
   Tensor(const BufferSlice &buffer_slice, DataType dtype)
-    : dtype_(dtype),
-      buffer_slice_(buffer_slice),
-      is_buffer_owner_(false),
-      name_("") {
+      : dtype_(dtype),
+        buffer_slice_(buffer_slice),
+        is_buffer_owner_(false),
+        name_("") {
     buffer_ = &buffer_slice_;
   }
 
@@ -102,8 +99,8 @@ class Tensor {
   inline index_t dim_size() const { return shape_.size(); }
 
   inline index_t dim(unsigned int index) const {
-    MACE_CHECK(index < shape_.size(), "Dim out of range: ",
-               index, " >= ", shape_.size());
+    MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
+               shape_.size());
     return shape_[index];
   }
 
@@ -112,40 +109,35 @@ class Tensor {
                            std::multiplies<int64_t>());
   }
 
-  inline index_t raw_size() const {
-    return size() * SizeOfType();
-  }
+  inline index_t raw_size() const { return size() * SizeOfType(); }
 
   inline bool has_opencl_image() const {
-    return buffer_ != nullptr && !buffer_->OnHost()
-      && typeid(*buffer_) == typeid(Image);
+    return buffer_ != nullptr && !buffer_->OnHost() &&
+           typeid(*buffer_) == typeid(Image);
   }
 
   inline bool has_opencl_buffer() const {
-    return buffer_ != nullptr && !buffer_->OnHost()
-      && !has_opencl_image();
+    return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
   }
 
   inline cl::Image *opencl_image() const {
     MACE_CHECK(has_opencl_image(), "do not have image");
-    return static_cast<cl::Image*>(buffer_->buffer());
+    return static_cast<cl::Image *>(buffer_->buffer());
   }
 
   inline cl::Buffer *opencl_buffer() const {
     MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
-    return static_cast<cl::Buffer*>(buffer_->buffer());
+    return static_cast<cl::Buffer *>(buffer_->buffer());
   }
 
-  inline index_t buffer_offset() const {
-    return buffer_->offset();
-  }
+  inline index_t buffer_offset() const { return buffer_->offset(); }
 
   inline const void *raw_data() const {
     MACE_CHECK(buffer_ != nullptr, "buffer is null");
     return buffer_->raw_data();
   }
 
-  template<typename T>
+  template <typename T>
   inline const T *data() const {
     MACE_CHECK(buffer_ != nullptr, "buffer is null");
     return buffer_->data<T>();
@@ -156,7 +148,7 @@ class Tensor {
     return buffer_->raw_mutable_data();
   }
 
-  template<typename T>
+  template <typename T>
   inline T *mutable_data() {
     MACE_CHECK(buffer_ != nullptr, "buffer is null");
     return static_cast<T *>(buffer_->raw_mutable_data());
@@ -188,25 +180,17 @@ class Tensor {
       is_buffer_owner_ = true;
     } else {
       MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
-      Image *image = dynamic_cast<Image*>(buffer_);
-      MACE_CHECK(image_shape[0] <= image->image_shape()[0]
-                   && image_shape[1] <= image->image_shape()[1],
-                 "tensor (source op ",
-                 name_,
-                 "): current physical image shape: ",
-                 image->image_shape()[0],
-                 ", ",
-                 image->image_shape()[1],
-                 " < logical image shape: ",
-                 image_shape[0],
-                 ", ",
-                 image_shape[1]);
+      Image *image = dynamic_cast<Image *>(buffer_);
+      MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
+                     image_shape[1] <= image->image_shape()[1],
+                 "tensor (source op ", name_,
+                 "): current physical image shape: ", image->image_shape()[0],
+                 ", ", image->image_shape()[1], " < logical image shape: ",
+                 image_shape[0], ", ", image_shape[1]);
     }
   }
 
-  inline void ResizeLike(const Tensor &other) {
-    ResizeLike(&other);
-  }
+  inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }
 
   inline void ResizeLike(const Tensor *other) {
     if (other->has_opencl_image()) {
@@ -229,7 +213,7 @@ class Tensor {
     memcpy(buffer_->raw_mutable_data(), src, size);
   }
 
-  template<typename T>
+  template <typename T>
   inline void Copy(const T *src, index_t length) {
     MACE_CHECK(length == size(), "copy src and dst with different size.");
     CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
@@ -248,13 +232,9 @@ class Tensor {
     return type_size;
   }
 
-  inline BufferBase *UnderlyingBuffer() const {
-    return buffer_;
-  }
+  inline BufferBase *UnderlyingBuffer() const { return buffer_; }
 
-  inline void SetSourceOpName(const std::string name) {
-    name_ = name;
-  }
+  inline void SetSourceOpName(const std::string name) { name_ = name; }
 
   inline void DebugPrint() const {
     using namespace numerical_chars;
@@ -272,8 +252,9 @@ class Tensor {
       }
       CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
     }
-    LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", "
-              << dim(2) << ", " << dim(3) << "], content:\n" << os.str();
+    LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", " << dim(2)
+              << ", " << dim(3) << "], content:\n"
+              << os.str();
   }
 
   class MappingGuard {
@@ -301,20 +282,20 @@ class Tensor {
     const Tensor *tensor_;
     std::vector<size_t> mapped_image_pitch_;
 
-   DISABLE_COPY_AND_ASSIGN(MappingGuard);
+    DISABLE_COPY_AND_ASSIGN(MappingGuard);
   };
 
  private:
   Allocator *allocator_;
   DataType dtype_;
   std::vector<index_t> shape_;
-  std::vector<size_t > image_shape_;
+  std::vector<size_t> image_shape_;
   BufferBase *buffer_;
   BufferSlice buffer_slice_;
   bool is_buffer_owner_;
   std::string name_;
 
- DISABLE_COPY_AND_ASSIGN(Tensor);
+  DISABLE_COPY_AND_ASSIGN(Tensor);
 };
 
 }  // namespace tensor
diff --git a/mace/core/testing/test_benchmark.cc b/mace/core/testing/test_benchmark.cc
index 97848c97..7dcf2a27 100644
--- a/mace/core/testing/test_benchmark.cc
+++ b/mace/core/testing/test_benchmark.cc
@@ -99,9 +99,7 @@ void RestartTiming() {
   accum_time = 0;
   start_time = NowMicros();
 }
-void StartTiming() {
-  start_time = NowMicros();
-}
+void StartTiming() { start_time = NowMicros(); }
 void StopTiming() {
   if (start_time != 0) {
     accum_time += (NowMicros() - start_time);
diff --git a/mace/core/testing/test_benchmark.h b/mace/core/testing/test_benchmark.h
index 79030593..2e3526cf 100644
--- a/mace/core/testing/test_benchmark.h
+++ b/mace/core/testing/test_benchmark.h
@@ -6,9 +6,9 @@
 #ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
 #define MACE_CORE_TESTING_TEST_BENCHMARK_H_
 
+#include <string>
 #include <utility>
 #include <vector>
-#include <string>
 
 #define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
 #define BENCHMARK(n)                                        \
diff --git a/mace/core/types.cc b/mace/core/types.cc
index e466f258..ef0a1755 100644
--- a/mace/core/types.cc
+++ b/mace/core/types.cc
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <map>
 #include <cstdint>
+#include <map>
 
 #include "mace/core/types.h"
 #include "mace/utils/logging.h"
@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
 
 std::string DataTypeToString(const DataType dt) {
   static std::map<DataType, std::string> dtype_string_map = {
-      {DT_FLOAT, "DT_FLOAT"},
-      {DT_HALF, "DT_HALF"},
-      {DT_DOUBLE, "DT_DOUBLE"},
-      {DT_UINT8, "DT_UINT8"},
-      {DT_INT8, "DT_INT8"},
-      {DT_INT32, "DT_INT32"},
-      {DT_UINT32, "DT_UINT32"},
-      {DT_UINT16, "DT_UINT16"},
-      {DT_INT64, "DT_INT64"},
-      {DT_BOOL, "DT_BOOL"},
-      {DT_STRING, "DT_STRING"}
-  };
+      {DT_FLOAT, "DT_FLOAT"},   {DT_HALF, "DT_HALF"},
+      {DT_DOUBLE, "DT_DOUBLE"}, {DT_UINT8, "DT_UINT8"},
+      {DT_INT8, "DT_INT8"},     {DT_INT32, "DT_INT32"},
+      {DT_UINT32, "DT_UINT32"}, {DT_UINT16, "DT_UINT16"},
+      {DT_INT64, "DT_INT64"},   {DT_BOOL, "DT_BOOL"},
+      {DT_STRING, "DT_STRING"}};
   MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
   return dtype_string_map[dt];
 }
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 59c509e7..1cfa1802 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -5,8 +5,8 @@
 #include <string>
 #include <vector>
 
-#include "mace/core/workspace.h"
 #include "mace/core/arg_helper.h"
+#include "mace/core/workspace.h"
 #include "mace/utils/timer.h"
 
 namespace mace {
@@ -19,7 +19,7 @@ Tensor *Workspace::CreateTensor(const std::string &name,
   } else {
     VLOG(3) << "Creating Tensor " << name;
     tensor_map_[name] =
-      std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
+        std::move(std::unique_ptr<Tensor>(new Tensor(alloc, type)));
   }
   return GetTensor(name);
 }
@@ -35,7 +35,7 @@ const Tensor *Workspace::GetTensor(const std::string &name) const {
 
 Tensor *Workspace::GetTensor(const std::string &name) {
   return const_cast<Tensor *>(
-    static_cast<const Workspace *>(this)->GetTensor(name));
+      static_cast<const Workspace *>(this)->GetTensor(name));
 }
 
 std::vector<std::string> Workspace::Tensors() const {
@@ -51,28 +51,28 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
   index_t model_data_size = 0;
   unsigned char *model_data_ptr = nullptr;
   for (auto &const_tensor : net_def.tensors()) {
-    if (model_data_ptr == nullptr
-      || reinterpret_cast<long long>(const_tensor.data())
-        < reinterpret_cast<long long>(model_data_ptr)) {
+    if (model_data_ptr == nullptr ||
+        reinterpret_cast<long long>(const_tensor.data()) <
+            reinterpret_cast<long long>(model_data_ptr)) {
       model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
     }
   }
   for (auto &const_tensor : net_def.tensors()) {
-    model_data_size = std::max(model_data_size,
-                               static_cast<index_t>(
-                                 (reinterpret_cast<long long>(const_tensor.data())
-                                   - reinterpret_cast<long long>(model_data_ptr))
-                                   + const_tensor.data_size()
-                                     * GetEnumTypeSize(const_tensor.data_type())));
+    model_data_size = std::max(
+        model_data_size,
+        static_cast<index_t>((reinterpret_cast<long long>(const_tensor.data()) -
+                              reinterpret_cast<long long>(model_data_ptr)) +
+                             const_tensor.data_size() *
+                                 GetEnumTypeSize(const_tensor.data_type())));
   }
   VLOG(3) << "Model data size: " << model_data_size;
 
   if (type == DeviceType::CPU) {
     tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
-      new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
+        new Buffer(GetDeviceAllocator(type), model_data_ptr, model_data_size)));
   } else {
     tensor_buffer_ = std::move(std::unique_ptr<Buffer>(
-      new Buffer(GetDeviceAllocator(type), model_data_size)));
+        new Buffer(GetDeviceAllocator(type), model_data_size)));
     tensor_buffer_->Map(nullptr);
     tensor_buffer_->Copy(model_data_ptr, 0, model_data_size);
     tensor_buffer_->UnMap();
@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
   for (auto &const_tensor : net_def.tensors()) {
     MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
     VLOG(3) << "Tensor name: " << const_tensor.name()
-            << ", data type: " << const_tensor.data_type()
-            << ", shape: "
+            << ", data type: " << const_tensor.data_type() << ", shape: "
             << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
                                                const_tensor.dims().end()));
     std::vector<index_t> dims;
@@ -90,14 +89,12 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
       dims.push_back(d);
     }
 
-    index_t
-      offset = (long long) const_tensor.data() - (long long) model_data_ptr;
+    index_t offset = (long long)const_tensor.data() - (long long)model_data_ptr;
     std::unique_ptr<Tensor> tensor(
-      new Tensor(BufferSlice(tensor_buffer_.get(),
-                             offset,
-                             const_tensor.data_size()
-                               * GetEnumTypeSize(const_tensor.data_type())),
-                 const_tensor.data_type()));
+        new Tensor(BufferSlice(tensor_buffer_.get(), offset,
+                               const_tensor.data_size() *
+                                   GetEnumTypeSize(const_tensor.data_type())),
+                   const_tensor.data_type()));
 
     tensor->Reshape(dims);
     tensor_map_[const_tensor.name()] = std::move(tensor);
@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
   // as GPU have consistent data type for each layer for now.
   // As DSP may have different data output type for each op,
   // we stick to the same concept.
-  for (auto &op: net_def.op()) {
+  for (auto &op : net_def.op()) {
     if (op.has_mem_id()) {
       const DataType op_dtype = static_cast<DataType>(
-        ArgumentHelper::GetSingleArgument<OperatorDef, int>(
-          op,
-          "T",
-          static_cast<int>(DT_FLOAT)));
+          ArgumentHelper::GetSingleArgument<OperatorDef, int>(
+              op, "T", static_cast<int>(DT_FLOAT)));
       if (op_dtype != DataType::DT_INVALID) {
         dtype = op_dtype;
         // find first valid data type, break
@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
     }
   }
   MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
-  for (auto &mem_block: net_def.mem_arena().mem_block()) {
-    std::unique_ptr<BufferBase>
-      image_buf(new Image({mem_block.x(), mem_block.y()}, dtype));
+  for (auto &mem_block : net_def.mem_arena().mem_block()) {
+    std::unique_ptr<BufferBase> image_buf(
+        new Image({mem_block.x(), mem_block.y()}, dtype));
     preallocated_allocator_.SetBuffer(mem_block.mem_id(), std::move(image_buf));
   }
   VLOG(3) << "Preallocate image to tensors";
-  for (auto &op: net_def.op()) {
+  for (auto &op : net_def.op()) {
     if (op.has_mem_id()) {
-      std::unique_ptr<Tensor> tensor
-        (new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
+      std::unique_ptr<Tensor> tensor(
+          new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
       tensor->SetSourceOpName(op.name());
-      VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")" << "; Mem: "
-              << op.mem_id() << "; Image shape: "
-              << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
-              << ", "
-              << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[1];
+      VLOG(3)
+          << "Tensor: " << op.name() << "(" << op.type() << ")"
+          << "; Mem: " << op.mem_id() << "; Image shape: "
+          << dynamic_cast<Image *>(tensor->UnderlyingBuffer())->image_shape()[0]
+          << ", "
+          << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
+                 ->image_shape()[1];
       tensor_map_[op.output(0)] = std::move(tensor);
     }
   }
diff --git a/mace/core/workspace.h b/mace/core/workspace.h
index 84274914..5e990d82 100644
--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -5,9 +5,9 @@
 #ifndef MACE_CORE_WORKSPACE_H_
 #define MACE_CORE_WORKSPACE_H_
 
+#include "mace/core/preallocated_pooled_allocator.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "mace/core/preallocated_pooled_allocator.h"
 
 namespace mace {
 
@@ -43,7 +43,7 @@ class Workspace {
 
   PreallocatedPooledAllocator preallocated_allocator_;
 
- DISABLE_COPY_AND_ASSIGN(Workspace);
+  DISABLE_COPY_AND_ASSIGN(Workspace);
 };
 
 }  // namespace mace
diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 72e52b67..d6689e70 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -6,9 +6,9 @@
 #define MACE_KERNELS_ACTIVATION_H_
 
 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -99,17 +99,15 @@ void PReLUActivation(const T *input_ptr,
       output_ptr[i] = in;
     }
   }
-
 }
 
 template <DeviceType D, typename T>
 class ActivationFunctor {
  public:
   ActivationFunctor(ActivationType type, T relux_max_limit)
-      : activation_(type),
-        relux_max_limit_(relux_max_limit){}
+      : activation_(type), relux_max_limit_(relux_max_limit) {}
 
-  void operator()(const Tensor *input, 
+  void operator()(const Tensor *input,
                   const Tensor *alpha,
                   Tensor *output,
                   StatsFuture *future) {
@@ -118,9 +116,11 @@ class ActivationFunctor {
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
       const T *alpha_ptr = alpha->data<T>();
-      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, output_ptr); 
+      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
+                      output_ptr);
     } else {
-      DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_);
+      DoActivation(input_ptr, output_ptr, output->size(), activation_,
+                   relux_max_limit_);
     }
   }
 
@@ -131,14 +131,16 @@ class ActivationFunctor {
 
 template <>
 void ActivationFunctor<DeviceType::NEON, float>::operator()(
-    const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future);
+    const Tensor *input,
+    const Tensor *alpha,
+    Tensor *output,
+    StatsFuture *future);
 
 template <typename T>
 class ActivationFunctor<DeviceType::OPENCL, T> {
  public:
   ActivationFunctor(ActivationType type, T relux_max_limit)
-      : activation_(type),
-        relux_max_limit_(relux_max_limit){}
+      : activation_(type), relux_max_limit_(relux_max_limit) {}
 
   void operator()(const Tensor *input,
                   const Tensor *alpha,
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index e772d880..6e9ba2d4 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -18,7 +18,7 @@ namespace mace {
 namespace kernels {
 
 namespace {
-  constexpr int kCostPerGroup = 1024;
+constexpr int kCostPerGroup = 1024;
 }  // namespace
 
 template <DeviceType D, typename T>
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index bceee6ff..1e6a12bf 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -10,10 +10,10 @@
 #endif
 
 #include "mace/core/future.h"
-#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 namespace kernels {
@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
                        const float relux_max_limit)
       : folded_constant_(folded_constant),
         activation_(activation),
-        relux_max_limit_(relux_max_limit){}
+        relux_max_limit_(relux_max_limit) {}
 
   const bool folded_constant_;
   const ActivationType activation_;
@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
   BatchNormFunctor(const bool folded_constant,
                    const ActivationType activation,
                    const float relux_max_limit)
-      : BatchNormFunctorBase(
-            folded_constant, activation, relux_max_limit) {}
+      : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
 
   void operator()(const Tensor *input,
                   const Tensor *scale,
@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
   BatchNormFunctor(const bool folded_constant,
                    const ActivationType activation,
                    const float relux_max_limit)
-      : BatchNormFunctorBase(
-            folded_constant, activation, relux_max_limit) {}
+      : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
   void operator()(const Tensor *input,
                   const Tensor *scale,
                   const Tensor *offset,
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index bea5a790..28adcf8d 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -6,9 +6,9 @@
 #define MACE_KERNELS_BIAS_ADD_H_
 
 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -32,7 +32,6 @@ struct BiasAddFunctor {
     const T *bias_ptr = bias->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
-
 #pragma omp parallel for collapse(4)
     for (index_t n = 0; n < batch; ++n) {
       for (index_t h = 0; h < height; ++h) {
@@ -44,7 +43,6 @@ struct BiasAddFunctor {
         }
       }
     }
-
   }
 };
 
diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h
index a3ef4cb2..3292e993 100644
--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
   bool i2b_;
 };
 
-template<DeviceType D, typename T>
-struct BufferToImageFunctor : BufferToImageFunctorBase{
-  BufferToImageFunctor(bool i2b = false) :
-      BufferToImageFunctorBase(i2b) {}
+template <DeviceType D, typename T>
+struct BufferToImageFunctor : BufferToImageFunctorBase {
+  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
   void operator()(Tensor *input,
                   const BufferType type,
                   Tensor *output,
@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase{
   }
 };
 
-template<typename T>
-struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
-  BufferToImageFunctor(bool i2b = false) :
-      BufferToImageFunctorBase(i2b) {}
+template <typename T>
+struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
+  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
   void operator()(Tensor *input,
                   const BufferType type,
                   Tensor *output,
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index c4f48a2f..642a93b9 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
  public:
   ChannelShuffleFunctor(const int group) : group_(group) {}
 
-  void operator()(const T *input, const index_t *input_shape,
-                  T *output, StatsFuture *future) {
+  void operator()(const T *input,
+                  const index_t *input_shape,
+                  T *output,
+                  StatsFuture *future) {
     index_t batch = input_shape[0];
     index_t channels = input_shape[1];
     index_t height = input_shape[2];
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 50171db7..021b0f61 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -6,23 +6,23 @@
 #define MACE_KERNELS_CONCAT_H_
 
 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 #include "mace/core/types.h"
 #include "mace/public/mace.h"
-#include "mace/core/tensor.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
 
 struct ConcatFunctorBase {
-  ConcatFunctorBase(const int32_t axis): axis_(axis){}
+  ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
 
   int32_t axis_;
 };
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct ConcatFunctor : ConcatFunctorBase {
-  ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
+  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
 
   void operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output,
@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
   }
 };
 
-template<typename T>
-struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
-  ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
+template <typename T>
+struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
+  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
 
   void operator()(const std::vector<const Tensor *> &input_list,
-                  Tensor *output, StatsFuture *future);
+                  Tensor *output,
+                  StatsFuture *future);
   cl::Kernel kernel_;
-
 };
 
 }  // namepsace kernels
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index f4f49565..99a2eaa3 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -116,9 +116,8 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
               sum[sum_idx] += vaddvq_f32(tmp);
 #else
               for (int inci = 0; inci < inc_tile_size; ++inci) {
-                sum[sum_idx] +=
-                    in[in_idx * inc_tile_size + inci] *
-                    weights[weights_idx * inc_tile_size + inci];
+                sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
+                                weights[weights_idx * inc_tile_size + inci];
               }
 #endif
             }
@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
         paddings_(paddings),
         dilations_(dilations),
         activation_(activation),
-        relux_max_limit_(relux_max_limit){}
+        relux_max_limit_(relux_max_limit) {}
 
   const int *strides_;  // [stride_h, stride_w]
   const Padding padding_type_;
@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
           padding_type_, output_shape.data(), paddings.data());
     } else {
       paddings = paddings_;
-      CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(),
-                     dilations_, strides_, RoundType::FLOOR, output_shape.data());
+      CalcOutputSize(input->shape().data(), filter->shape().data(),
+                     paddings_.data(), dilations_, strides_, RoundType::FLOOR,
+                     output_shape.data());
     }
     output->Resize(output_shape);
 
diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc
index 9b7160a7..b1a83782 100644
--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -145,7 +145,7 @@ void CalcOutputSize(const index_t *input_shape,   // NHWC
   MACE_CHECK(dilations[0] > 0 && dilations[1] > 0,
              "Invalid dilations, must >= 1");
   MACE_CHECK((dilations[0] == 1 || strides[0] == 1) &&
-      (dilations[1] == 1 || strides[1] == 1),
+                 (dilations[1] == 1 || strides[1] == 1),
              "If dilations > 1, strides should be 1");
   MACE_CHECK_NOTNULL(output_shape);
   MACE_CHECK_NOTNULL(padding_size);
@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape,   // NHWC
   */
   output_shape[0] = input_shape[0];
   if (round_type == FLOOR) {
-    output_shape[1] = static_cast<index_t>(std::floor(1.0 * (input_shape[1] + padding_size[0]
-        - filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1);
-    output_shape[2] = static_cast<index_t>(std::floor(1.0 * (input_shape[2] + padding_size[1]
-        - filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1);
+    output_shape[1] = static_cast<index_t>(
+        std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
+                          (filter_shape[0] - 1) * (dilations[0] - 1)) /
+                   strides[0]) +
+        1);
+    output_shape[2] = static_cast<index_t>(
+        std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
+                          (filter_shape[1] - 1) * (dilations[1] - 1)) /
+                   strides[1]) +
+        1);
   } else {
-    output_shape[1] = static_cast<index_t>(std::ceil(1.0 * (input_shape[1] + padding_size[0]
-        - filter_shape[0] - (filter_shape[0] - 1) * (dilations[0] - 1)) / strides[0]) + 1);
-    output_shape[2] = static_cast<index_t>(std::ceil(1.0 * (input_shape[2] + padding_size[1]
-        - filter_shape[1] - (filter_shape[1] - 1) * (dilations[1] - 1)) / strides[1]) + 1);
+    output_shape[1] = static_cast<index_t>(
+        std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
+                         (filter_shape[0] - 1) * (dilations[0] - 1)) /
+                  strides[0]) +
+        1);
+    output_shape[2] = static_cast<index_t>(
+        std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
+                         (filter_shape[1] - 1) * (dilations[1] - 1)) /
+                  strides[1]) +
+        1);
   }
   output_shape[3] = filter_shape[2];
-
 }
 
 void CalPaddingSize(const index_t *input_shape,   // NCHW
diff --git a/mace/kernels/conv_pool_2d_util.h b/mace/kernels/conv_pool_2d_util.h
index 24097e81..45b1d8a4 100644
--- a/mace/kernels/conv_pool_2d_util.h
+++ b/mace/kernels/conv_pool_2d_util.h
@@ -15,7 +15,7 @@ enum Padding {
   FULL = 2,   // Pads with one less than the filter size on both sides
 };
 
-enum RoundType{
+enum RoundType {
   FLOOR = 0,
   CEIL = 1,
 };
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index da4d00be..c0a1719f 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -10,9 +10,9 @@
 #endif
 
 #include "mace/core/future.h"
-#include "mace/public/mace.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/public/mace.h"
 
 namespace mace {
 namespace kernels {
@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
         paddings_(paddings),
         dilations_(dilations),
         activation_(activation),
-        relux_max_limit_(relux_max_limit){}
+        relux_max_limit_(relux_max_limit) {}
 
   const int *strides_;  // [stride_h, stride_w]
   const Padding padding_type_;
@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
           padding_type_, output_shape.data(), paddings.data());
     } else {
       paddings = paddings_;
-      CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(),
-                     dilations_, strides_, RoundType::FLOOR, output_shape.data());
+      CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
+                     paddings_.data(), dilations_, strides_, RoundType::FLOOR,
+                     output_shape.data());
     }
     auto input_shape = fake_filter_shape;
     output->Resize(output_shape);
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 18f0604c..263dfb80 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -5,13 +5,13 @@
 #define MACE_KERNELS_ELTWISE_H_
 
 #include "mace/core/future.h"
-#include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 
 namespace mace {
 namespace kernels {
 
-enum EltwiseType{
+enum EltwiseType {
   PROD = 0,
   SUM = 1,
   MAX = 2,
@@ -19,8 +19,7 @@ enum EltwiseType{
 };
 
 struct EltwiseFunctorBase {
-  EltwiseFunctorBase(const EltwiseType type,
-                     const std::vector<float> &coeff)
+  EltwiseFunctorBase(const EltwiseType type, const std::vector<float> &coeff)
       : type_(type), coeff_(coeff) {}
 
   EltwiseType type_;
@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
 
 template <DeviceType D, typename T>
 struct EltwiseFunctor : EltwiseFunctorBase {
-  EltwiseFunctor(const EltwiseType type,
-                     const std::vector<float> &coeff)
+  EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
       : EltwiseFunctorBase(type, coeff) {}
 
   void operator()(const Tensor *input0,
@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
     switch (type_) {
       case PROD:
 #pragma omp parallel for
-        for(index_t i = 0; i < size; ++i) {
+        for (index_t i = 0; i < size; ++i) {
           output_ptr[i] = input0_ptr[i] * input1_ptr[i];
         }
         break;
@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
         } else {
 #pragma omp parallel for
           for (index_t i = 0; i < size; ++i) {
-            output_ptr[i] = coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
+            output_ptr[i] =
+                coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
           }
         }
         break;
       case MAX:
 #pragma omp parallel for
-        for(index_t i = 0; i < size; ++i) {
+        for (index_t i = 0; i < size; ++i) {
           output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]);
         }
         break;
       case MIN:
 #pragma omp parallel for
-        for(index_t i = 0; i < size; ++i) {
+        for (index_t i = 0; i < size; ++i) {
           output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]);
         }
         break;
@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
   }
 };
 
-
 template <typename T>
-struct EltwiseFunctor<DeviceType::OPENCL, T>: EltwiseFunctorBase {
-  EltwiseFunctor(const EltwiseType type,
-                 const std::vector<float> &coeff)
+struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
+  EltwiseFunctor(const EltwiseType type, const std::vector<float> &coeff)
       : EltwiseFunctorBase(type, coeff) {}
 
   void operator()(const Tensor *input0,
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index 031717f1..740faacc 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -6,8 +6,8 @@
 #define MACE_KERNELS_FULLY_CONNECTED_H_
 
 #include "mace/core/future.h"
-#include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
 
 namespace mace {
@@ -16,25 +16,23 @@ namespace kernels {
 struct FullyConnectedBase {
   FullyConnectedBase(const ActivationType activation,
                      const float relux_max_limit)
-      : activation_(activation),
-        relux_max_limit_(relux_max_limit){}
+      : activation_(activation), relux_max_limit_(relux_max_limit) {}
 
   const ActivationType activation_;
   const float relux_max_limit_;
 };
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct FullyConnectedFunctor : FullyConnectedBase {
   FullyConnectedFunctor(const ActivationType activation,
-                        const float relux_max_limit) :
-      FullyConnectedBase(activation, relux_max_limit) {}
+                        const float relux_max_limit)
+      : FullyConnectedBase(activation, relux_max_limit) {}
 
   void operator()(const Tensor *input,
                   const Tensor *weight,
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
-
     std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
     output->Resize(output_shape);
     const index_t N = output->dim(0);
@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
   }
 };
 
-template<typename T>
+template <typename T>
 struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
   FullyConnectedFunctor(const ActivationType activation,
-                        const float relux_max_limit) :
-      FullyConnectedBase(activation, relux_max_limit) {}
+                        const float relux_max_limit)
+      : FullyConnectedBase(activation, relux_max_limit) {}
 
   void operator()(const Tensor *input,
                   const Tensor *weight,
diff --git a/mace/kernels/global_avg_pooling.h b/mace/kernels/global_avg_pooling.h
index 8b718e57..cd971558 100644
--- a/mace/kernels/global_avg_pooling.h
+++ b/mace/kernels/global_avg_pooling.h
@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {
 
 template <>
 void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
-    const float *input, const index_t *input_shape,
-    float *output, StatsFuture *future);
+    const float *input,
+    const index_t *input_shape,
+    float *output,
+    StatsFuture *future);
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 5ed6e77e..d893e951 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -6,20 +6,18 @@
 #define MACE_KERNELS_MATMUL_H_
 
 #include "mace/core/future.h"
-#include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 
 namespace mace {
 namespace kernels {
 
-
 template <DeviceType D, typename T>
 struct MatMulFunctor {
   void operator()(const Tensor *A,
                   const Tensor *B,
                   Tensor *C,
                   StatsFuture *future) {
-
     std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
     C->Resize(c_shape);
     const index_t N = C->dim(0);
@@ -52,7 +50,6 @@ struct MatMulFunctor {
   }
 };
 
-
 template <typename T>
 struct MatMulFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *A,
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index 84dc4408..19094ef7 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
 #pragma omp parallel for collapse(2)
   for (index_t i = 0; i < n; ++i) {
     for (index_t j = 0; j < sample_size; ++j) {
-      const float *input_sample_ptr = input_ptr + (i * sample_size + j) * channel;
+      const float *input_sample_ptr =
+          input_ptr + (i * sample_size + j) * channel;
       float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
       const float *new_scale_ptr = new_scale.data();
       const float *new_offset_ptr = new_offset.data();
diff --git a/mace/kernels/neon/conv_2d_neon.cc b/mace/kernels/neon/conv_2d_neon.cc
index 566abd75..8b937ddf 100644
--- a/mace/kernels/neon/conv_2d_neon.cc
+++ b/mace/kernels/neon/conv_2d_neon.cc
@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
   MACE_CHECK_NOTNULL(filter);
   MACE_CHECK_NOTNULL(output);
 
-
   std::vector<index_t> output_shape_vec(4);
   std::vector<int> paddings(2);
   kernels::CalcPaddingAndOutputSize(
-      input->shape().data(), filter->shape().data(), dilations_,
-      strides_, paddings_, output_shape_vec.data(), paddings.data());
+      input->shape().data(), filter->shape().data(), dilations_, strides_,
+      paddings_, output_shape_vec.data(), paddings.data());
   output->Resize(output_shape_vec);
 
   typedef void (*Conv2dNeonFunction)(
@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
   auto output_shape = output->shape().data();
 
   auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input_data, input_shape, filter_data, nullptr,
-                   bias_data, output_data, output_shape);
+  conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
+                   output_data, output_shape);
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/neon/conv_2d_neon_3x3.cc b/mace/kernels/neon/conv_2d_neon_3x3.cc
index 6a2aa2ea..af1e83cb 100644
--- a/mace/kernels/neon/conv_2d_neon_3x3.cc
+++ b/mace/kernels/neon/conv_2d_neon_3x3.cc
@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input,  // NCHW
   int input_channels = input_shape[1];
   int input_height = input_shape[2];
   int input_width = input_shape[3];
-  int multiplier =
-      filter_shape == nullptr ? 0 : filter_shape[0];
-  int filter_in_channels =
-      filter_shape == nullptr ? input_channels : 1;
+  int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
+  int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
 #pragma omp parallel for collapse(2)
   for (int b = 0; b < output_batch; ++b) {
     for (int oc = 0; oc < output_channels; ++oc) {
@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input,  // NCHW
   int input_channels = input_shape[1];
   int input_height = input_shape[2];
   int input_width = input_shape[3];
-  int multiplier =
-      filter_shape == nullptr ? 0 : filter_shape[0];
-  int filter_in_channels =
-      filter_shape == nullptr ? input_channels : 1;
+  int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
+  int filter_in_channels = filter_shape == nullptr ? input_channels : 1;
 
 #pragma omp parallel for collapse(2)
   for (int b = 0; b < output_batch; ++b) {
diff --git a/mace/kernels/neon/depthwise_conv_neon.cc b/mace/kernels/neon/depthwise_conv_neon.cc
index 7fa1a9f5..fc8f457a 100644
--- a/mace/kernels/neon/depthwise_conv_neon.cc
+++ b/mace/kernels/neon/depthwise_conv_neon.cc
@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
                  << "filter" << kernel_h << "x" << kernel_w << ","
                  << " stride " << strides_[0] << "x" << strides_[1]
                  << " is not implemented yet, using slow version";
-    DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
-                                                   dilations_)(
-        input, filter, bias, output, future);
+    DepthwiseConv2dFunctor<DeviceType::CPU, float>(
+        strides_, paddings_, dilations_)(input, filter, bias, output, future);
     return;
   }
 
@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
     input_shape = padded_input.shape().data();
   }
   auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr, output_ptr,
-                   output_shape);
+  conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
+                   output_ptr, output_shape);
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 75922a9e..180e38ca 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ =
-        runtime->BuildKernel("activation", kernel_name, built_options);
+    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
     int idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     if (activation_ == PRELU) {
@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::string tuning_key =
-      Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
-             output->dim(2), output->dim(3));
+      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
+             output->dim(3));
   TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
 }
 
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 38388081..a6863a59 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -5,8 +5,8 @@
 #include "mace/kernels/addn.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
 
     uint32_t idx = 0;
     for (auto input : input_tensors) {
-      kernel_.setArg(idx++,
-                         *(input->opencl_image()));
+      kernel_.setArg(idx++, *(input->opencl_image()));
     }
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
   }
 
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_pixels),
-      static_cast<uint32_t>(batch_height_pixels)
-  };
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
   const std::vector<uint32_t> lws = {64, 16, 1};
   std::stringstream ss;
-  ss << "addn_opencl_kernel_"
-     << output_shape[0] << "_"
-     << output_shape[1] << "_"
-     << output_shape[2] << "_"
-     << output_shape[3];
+  ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
+     << "_" << output_shape[2] << "_" << output_shape[3];
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 };
 
-template
-struct AddNFunctor<DeviceType::OPENCL, float>;
+template struct AddNFunctor<DeviceType::OPENCL, float>;
 
-template
-struct AddNFunctor<DeviceType::OPENCL, half>;
+template struct AddNFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 571bdd53..8f14f34b 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
 
-    kernel_ =
-        runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
 
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(scale->opencl_image()));
-    kernel_.setArg(idx++,
-                     *(offset->opencl_image()));
+    kernel_.setArg(idx++, *(offset->opencl_image()));
     if (!folded_constant_) {
-      kernel_.setArg(idx++,
-                       *(mean->opencl_image()));
+      kernel_.setArg(idx++, *(mean->opencl_image()));
       kernel_.setArg(idx++, *(var->opencl_image()));
       kernel_.setArg(idx++, epsilon);
     }
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index c8507433..613b633b 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -12,11 +12,10 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
-    const Tensor *input,
-    const Tensor *bias,
-    Tensor *output,
-    StatsFuture *future) {
+void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+                                                       const Tensor *bias,
+                                                       Tensor *output,
+                                                       StatsFuture *future) {
   const index_t batch = input->dim(0);
   const index_t height = input->dim(1);
   const index_t width = input->dim(2);
@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
 
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      kernel_, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      nullptr, &event);
+      kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   MACE_CHECK(error == CL_SUCCESS);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
   }
 }
 
-template
-struct BiasAddFunctor<DeviceType::OPENCL, float>;
-template
-struct BiasAddFunctor<DeviceType::OPENCL, half>;
+template struct BiasAddFunctor<DeviceType::OPENCL, float>;
+template struct BiasAddFunctor<DeviceType::OPENCL, half>;
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 19be430f..7b484464 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -9,36 +9,33 @@
 namespace mace {
 namespace kernels {
 
-template<typename T>
-void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
-                                                             const BufferType type,
-                                                             Tensor *image,
-                                                             StatsFuture *future) {
+template <typename T>
+void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
+    Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
   std::vector<size_t> image_shape;
   if (!i2b_) {
     CalImage2DShape(buffer->shape(), type, image_shape);
-    if(type == WINOGRAD_FILTER) {
-      std::vector<index_t> new_shape = 
-        CalWinogradShape(buffer->shape(), type);
+    if (type == WINOGRAD_FILTER) {
+      std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
       image->ResizeImage(new_shape, image_shape);
     } else {
       image->ResizeImage(buffer->shape(), image_shape);
     }
   } else {
-    Image *image_buf = dynamic_cast<Image*>(image->UnderlyingBuffer());
+    Image *image_buf = dynamic_cast<Image *>(image->UnderlyingBuffer());
     image_shape = image_buf->image_shape();
     buffer->Resize(image->shape());
   }
 
-  size_t gws[2] = {image_shape[0],
-                   image_shape[1]};
+  size_t gws[2] = {image_shape[0], image_shape[1]};
   std::string kernel_name;
   switch (type) {
     case CONV2D_FILTER:
       kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
       break;
     case DW_CONV2D_FILTER:
-      kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
+      kernel_name =
+          i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
       break;
     case IN_OUT_CHANNEL:
       kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
       break;
     case IN_OUT_HEIGHT:
     case WEIGHT_HEIGHT:
-      kernel_name = i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image";
+      kernel_name = i2b_ ? "in_out_height_image_to_buffer"
+                         : "in_out_height_buffer_to_image";
       break;
     case IN_OUT_WIDTH:
       MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
       break;
     case WINOGRAD_FILTER:
       gws[1] /= 16;
-      kernel_name = i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image";
+      kernel_name = i2b_ ? "winograd_filter_image_to_buffer"
+                         : "winograd_filter_buffer_to_image";
       break;
   }
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   built_options.emplace(kernel_name_ss.str());
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToCLCMDDt(DataTypeToEnum<T>::value));
   } else {
-    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
   auto runtime = OpenCLRuntime::Global();
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
-                                         obfuscated_kernel_name,
-                                         built_options);
+                                         obfuscated_kernel_name, built_options);
 
   uint32_t idx = 0;
   b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
   if (!i2b_) {
-    MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0, "buffer offset not aligned");
-    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->buffer_offset() / GetEnumTypeSize(buffer->dtype())));
+    MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
+               "buffer offset not aligned");
+    b2f_kernel.setArg(idx++,
+                      static_cast<uint32_t>(buffer->buffer_offset() /
+                                            GetEnumTypeSize(buffer->dtype())));
   }
   if (type == ARGUMENT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
-  } else if(type == WEIGHT_HEIGHT) {
+  } else if (type == WEIGHT_HEIGHT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
     b2f_kernel.setArg(idx++, 1);
@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, 64};
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      b2f_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1]),
-      cl::NDRange(lws[0], lws[1]),
-      nullptr, &event);
+      b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
+      cl::NDRange(lws[0], lws[1]), nullptr, &event);
   MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
 
   if (future != nullptr) {
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index 28b9addd..ac870bd3 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -18,8 +18,8 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
 
-__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
+__constant sampler_t SAMPLER =
+    CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
 inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #ifdef USE_PRELU
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 48466e6a..9cd508bd 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -5,8 +5,8 @@
 #include "mace/kernels/concat.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
 
     uint32_t idx = 0;
-    kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input0->opencl_image())));
-    kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input1->opencl_image())));
+    kernel->setArg(idx++,
+                   *(static_cast<const cl::Image2D *>(input0->opencl_image())));
+    kernel->setArg(idx++,
+                   *(static_cast<const cl::Image2D *>(input1->opencl_image())));
     kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
-    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->opencl_image())));
+    kernel->setArg(idx++,
+                   *(static_cast<cl::Image2D *>(output->opencl_image())));
   }
 
   const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk),
-      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
       static_cast<uint32_t>(batch * height),
   };
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::stringstream ss;
-  ss << "concat_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
+  ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
 }
 
@@ -97,27 +96,25 @@ static void ConcatN(cl::Kernel *kernel,
     index_t input_channel_blk = input->dim(3) / 4;
     chan_blk_offset += input_channel_blk;
     const uint32_t gws[3] = {
-        static_cast<uint32_t>(input_channel_blk),
-        static_cast<uint32_t>(width),
+        static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
         static_cast<uint32_t>(batch * height),
     };
     const std::vector<uint32_t> lws = {8, 16, 8, 1};
     std::stringstream ss;
-    ss << "concat_n_opencl_kernel_"
-       << input_channel_blk << "_"
-       << width << "_"
+    ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
        << batch * height;
     TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
   }
 }
 
-template<typename T>
-void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
-                                                      Tensor *output,
-                                                      StatsFuture *future) {
+template <typename T>
+void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
+    const std::vector<const Tensor *> &input_list,
+    Tensor *output,
+    StatsFuture *future) {
   const int inputs_count = input_list.size();
   MACE_CHECK(inputs_count >= 2 && axis_ == 3)
-    << "Concat opencl kernel only support >=2 elements with axis == 3";
+      << "Concat opencl kernel only support >=2 elements with axis == 3";
 
   const Tensor *input0 = input_list[0];
   bool divisible_four = input0->dim(axis_) % 4 == 0;
@@ -137,8 +134,9 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
     }
     output_shape[axis_] += input->dim(axis_);
   }
-  MACE_CHECK(inputs_count == 2 || divisible_four,
-             "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
+  MACE_CHECK(
+      inputs_count == 2 || divisible_four,
+      "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
   output->ResizeImage(output_shape, image_shape);
@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
     default:
       if (divisible_four) {
         ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
-      }
-      else {
+      } else {
         MACE_NOT_IMPLEMENTED;
       }
   }
 };
 
-template
-struct ConcatFunctor<DeviceType::OPENCL, float>;
-template
-struct ConcatFunctor<DeviceType::OPENCL, half>;
+template struct ConcatFunctor<DeviceType::OPENCL, float>;
+template struct ConcatFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 5dc97944..3ed87e7c 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          Tensor *output,
                          StatsFuture *future);
 
-template<typename T>
+template <typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                       const Tensor *filter,
                                                       const Tensor *bias,
                                                       Tensor *output,
                                                       StatsFuture *future) {
   typedef void (*Conv2dOpenclFunction)(
-      cl::Kernel *kernel,
-      const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride,
-      const int *padding, const int *dilations, const ActivationType activation,
-      const float relux_max_limit, const DataType dt,
-      Tensor *output, StatsFuture *future);
+      cl::Kernel * kernel, const Tensor *input, const Tensor *filter,
+      const Tensor *bias, const int stride, const int *padding,
+      const int *dilations, const ActivationType activation,
+      const float relux_max_limit, const DataType dt, Tensor *output,
+      StatsFuture *future);
   // Selection matrix: kernel_size x stride_size
-  static const Conv2dOpenclFunction selector[5] =
-      {Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
+  static const Conv2dOpenclFunction selector[5] = {
+      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
 
   index_t kernel_h = filter->dim(0);
   index_t kernel_w = filter->dim(1);
@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         padding_type_, output_shape.data(), paddings.data());
   } else {
     paddings = paddings_;
-    CalcOutputSize(input->shape().data(), filter->shape().data(), paddings_.data(),
-                   dilations_, strides_, RoundType::FLOOR, output_shape.data());
+    CalcOutputSize(input->shape().data(), filter->shape().data(),
+                   paddings_.data(), dilations_, strides_, RoundType::FLOOR,
+                   output_shape.data());
   }
 
   std::vector<size_t> output_image_shape;
@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   if (kernel_h == kernel_w && kernel_h <= 5 &&
       selector[kernel_h - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1];
-    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_,
-                relux_max_limit_, DataTypeToEnum<T>::value, output, future);
+    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
+                dilations_, activation_, relux_max_limit_,
+                DataTypeToEnum<T>::value, output, future);
   } else {
-    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-                 activation_, relux_max_limit_, DataTypeToEnum<T>::value, output, future);
+    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
+                 dilations_, activation_, relux_max_limit_,
+                 DataTypeToEnum<T>::value, output, future);
   }
 }
 
-template
-struct Conv2dFunctor<DeviceType::OPENCL, float>;
-template
-struct Conv2dFunctor<DeviceType::OPENCL, half>;
+template struct Conv2dFunctor<DeviceType::OPENCL, float>;
+template struct Conv2dFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index b370b32b..41eaad56 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     }
 
     auto runtime = OpenCLRuntime::Global();
-    *kernel =
-        runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
 
     uint32_t idx = 0;
-    kernel->setArg(idx++,
-                          *(input->opencl_image()));
-    kernel->setArg(idx++,
-                          *(filter->opencl_image()));
+    kernel->setArg(idx++, *(input->opencl_image()));
+    kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
-      kernel->setArg(idx++,
-                            *(bias->opencl_image()));
+      kernel->setArg(idx++, *(bias->opencl_image()));
     }
-    kernel->setArg(idx++,
-                          *(output->opencl_image()));
+    kernel->setArg(idx++, *(output->opencl_image()));
     // FIXME handle flexable data type: half not supported
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<int>(input_height));
@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 }
 
-
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index a7eb668d..df2672c9 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     }
 
     auto runtime = OpenCLRuntime::Global();
-    *kernel =
-        runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
 
     uint32_t idx = 0;
-    kernel->setArg(idx++,
-                          *(input->opencl_image()));
-    kernel->setArg(idx++,
-                          *(filter->opencl_image()));
+    kernel->setArg(idx++, *(input->opencl_image()));
+    kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
-      kernel->setArg(idx++,
-                            *(bias->opencl_image()));
+      kernel->setArg(idx++, *(bias->opencl_image()));
     }
-    kernel->setArg(idx++,
-                          *(output->opencl_image()));
+    kernel->setArg(idx++, *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<int>(input->dim(1)));
     kernel->setArg(idx++, static_cast<int>(input->dim(2)));
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 5f3ffa5e..c317aa8c 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     }
 
     auto runtime = OpenCLRuntime::Global();
-    *kernel =
-        runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
 
     uint32_t idx = 0;
-    kernel->setArg(idx++,
-                          *(input->opencl_image()));
-    kernel->setArg(idx++,
-                          *(filter->opencl_image()));
+    kernel->setArg(idx++, *(input->opencl_image()));
+    kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
-      kernel->setArg(idx++,
-                            *(bias->opencl_image()));
+      kernel->setArg(idx++, *(bias->opencl_image()));
     }
-    kernel->setArg(idx++,
-                          *(output->opencl_image()));
+    kernel->setArg(idx++, *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
     kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 3bbd4f43..1b99188b 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
-  if(kernel->get() == nullptr) {
+  if (kernel->get() == nullptr) {
     const index_t input_batch = input->dim(0);
     const index_t input_height = input->dim(1);
     const index_t input_width = input->dim(2);
@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
 
-    *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+    *kernel =
+        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
 
     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
-    kernel->setArg(
-        idx++, *(filter->opencl_image()));
+    kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
-      kernel->setArg(
-          idx++, *(bias->opencl_image()));
+      kernel->setArg(idx++, *(bias->opencl_image()));
     }
-    kernel->setArg(
-        idx++, *(output->opencl_image()));
+    kernel->setArg(idx++, *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<short>(input_height));
     kernel->setArg(idx++, static_cast<short>(input_width));
@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
         padding_type_, output_shape.data(), paddings.data());
   } else {
     paddings = paddings_;
-    CalcOutputSize(input->shape().data(), fake_filter_shape.data(), paddings_.data(),
-                   dilations_, strides_, RoundType::FLOOR, output_shape.data());
+    CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
+                   paddings_.data(), dilations_, strides_, RoundType::FLOOR,
+                   output_shape.data());
   }
 
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);
 
-  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-                  activation_, relux_max_limit_, 
+  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
+                  dilations_, activation_, relux_max_limit_,
                   DataTypeToEnum<T>::value, output, future);
 }
 
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index 8c589c2f..82312c75 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
                                                        const Tensor *input1,
                                                        Tensor *output,
                                                        StatsFuture *future) {
-
   const index_t batch = input0->dim(0);
   const index_t height = input0->dim(1);
   const index_t width = input0->dim(2);
@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
 
     uint32_t idx = 0;
-    kernel_.setArg(idx++,
-                   *(input0->opencl_image()));
-    kernel_.setArg(idx++,
-                   *(input1->opencl_image()));
+    kernel_.setArg(idx++, *(input0->opencl_image()));
+    kernel_.setArg(idx++, *(input1->opencl_image()));
     if (!coeff_.empty()) {
       kernel_.setArg(idx++, coeff_[0]);
       kernel_.setArg(idx++, coeff_[1]);
@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     kernel_.setArg(idx++, *(output->opencl_image()));
   }
 
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_pixels),
-      static_cast<uint32_t>(batch_height_pixels)
-  };
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
   const std::vector<uint32_t> lws = {64, 16, 1};
   std::stringstream ss;
-  ss << "eltwise_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
+  ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 4a4eacc1..0e208cf4 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -10,14 +10,13 @@
 namespace mace {
 namespace kernels {
 
-template<typename T>
+template <typename T>
 void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input,
     const Tensor *weight,
     const Tensor *bias,
     Tensor *output,
     StatsFuture *future) {
-
   std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
       default:
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
-    kernel_ = runtime->BuildKernel("fully_connected", kernel_name, built_options);
+    kernel_ =
+        runtime->BuildKernel("fully_connected", kernel_name, built_options);
 
     uint32_t idx = 0;
-    kernel_.setArg(idx++,
-                   *(input->opencl_image()));
-    kernel_.setArg(idx++,
-                   *(weight->opencl_image()));
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, *(weight->opencl_image()));
     if (bias != nullptr) {
-      kernel_.setArg(idx++,
-                     *(bias->opencl_image()));
+      kernel_.setArg(idx++, *(bias->opencl_image()));
     }
-    kernel_.setArg(idx++,
-                   *(output->opencl_image()));
+    kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
     kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
     kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
   }
 
   const uint32_t gws[2] = {
-      static_cast<uint32_t>(batch),
-      static_cast<uint32_t>(output_blocks),
+      static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
   };
   const std::vector<uint32_t> lws = {16, 64, 1};
   std::stringstream ss;
-  ss << "fc_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
+  ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
+     << output->dim(2) << "_" << output->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
-
 };
 
-template
-struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
+template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
 
-template
-struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
+template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index ee141adb..791db167 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -3,8 +3,8 @@
 //
 
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
@@ -28,8 +28,9 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
 }
 
 // [H * W * M, (Ic + 3) / 4]
-void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */
-                                        std::vector<size_t> &image_shape) {
+void CalDepthwiseConv2dFilterImageShape(
+    const std::vector<index_t> &shape, /* HWIM */
+    std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape.resize(2);
   image_shape[0] = shape[0] * shape[1] * shape[3];
@@ -47,8 +48,9 @@ void CalArgImageShape(const std::vector<index_t> &shape,
 
 // Only support 3x3 now
 // [ (Ic + 3) / 4, 16 * Oc]
-void CalWinogradFilterImageShape(const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
-                                 std::vector<size_t> &image_shape) {
+void CalWinogradFilterImageShape(
+    const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
+    std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape.resize(2);
   image_shape[0] = RoundUpDiv4(shape[1]);
@@ -115,19 +117,16 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
   }
 }
 
-
 std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                       const BufferType type) {
   if (type == WINOGRAD_FILTER) {
     return {16, shape[0], shape[1], 1};
-  }else if (type == IN_OUT_HEIGHT) {
-    index_t out_width = shape[0] *
-                        ((shape[1] - 1) / 2) *
-                        ((shape[2] - 1) / 2);
+  } else if (type == IN_OUT_HEIGHT) {
+    index_t out_width = shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
     return {16, shape[3], out_width, 1};
   } else {
     LOG(FATAL) << "Mace not supported yet.";
-  return std::vector<index_t>();
+    return std::vector<index_t>();
   }
 }
 
@@ -188,10 +187,10 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
     local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(gws[2],
-                                     kwg_size / (local_ws[0] * local_ws[1]));
+    local_ws[2] =
+        std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
     return {
-      // TODO tuning these magic numbers
+        // TODO tuning these magic numbers
         {local_ws[0], local_ws[1], local_ws[2], 1},
         {kwg_size / 16, 4, 4, 1},
         {kwg_size / 32, 4, 8, 1},
@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params,
-                  Timer *timer,
+  auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
                   std::vector<uint32_t> *tuning_result) -> cl_int {
-    MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D";
+    MACE_CHECK(params.size() == 4)
+        << "Tuning parameters of 3D kernel must be 4D";
     cl_int error = CL_SUCCESS;
     if (timer == nullptr) {
       uint32_t num_blocks = params[3];
       const uint32_t block_size = gws[2] / num_blocks;
       if (gws[2] % num_blocks > 0) num_blocks++;
       for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
         error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel,
-            cl::NDRange(0, 0, i * block_size),
+            kernel, cl::NDRange(0, 0, i * block_size),
             cl::NDRange(gws[0], gws[1], gws2),
             cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
         MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
       if (LimitKernelTime()) {
         double elapse_time = timer->AccumulatedMicros();
         timer->ClearTiming();
-        uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+        uint32_t num_blocks = std::min(
+            static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
         (*tuning_result)[3] = num_blocks;
         const uint32_t block_size = gws[2] / num_blocks;
         if (gws[2] % num_blocks > 0) num_blocks++;
         for (uint32_t i = 0; i < num_blocks; ++i) {
-          uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+          uint32_t gws2 =
+              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
           error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel,
-              cl::NDRange(0, 0, i * block_size),
+              kernel, cl::NDRange(0, 0, i * block_size),
               cl::NDRange(gws[0], gws[1], gws2),
               cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
             {kwg_size / 256, 256, 1},
             {kwg_size / 512, 512, 1},
             {kwg_size, 1, 1},
-            {1, kwg_size, 1}
-    };
+            {1, kwg_size, 1}};
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params,
-                  Timer *timer,
+  auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
                   std::vector<uint32_t> *tuning_result) -> cl_int {
-    MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d";
+    MACE_CHECK(params.size() == 3)
+        << "Tuning parameters of 2D kernel must be 3d";
     cl_int error = CL_SUCCESS;
     if (timer == nullptr) {
       uint32_t num_blocks = params[2];
       const uint32_t block_size = gws[1] / num_blocks;
       if (gws[1] % num_blocks > 0) num_blocks++;
       for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
         error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel,
-            cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]),
-            nullptr, &event);
+            kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
         MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       }
     } else {
       timer->ClearTiming();
       error = runtime->command_queue().enqueueNDRangeKernel(
-          kernel, cl::NullRange,
-          cl::NDRange(gws[0], gws[1]),
+          kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
           cl::NDRange(params[0], params[1]), nullptr, &event);
       MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
       timer->AccumulateTiming();
@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
       if (LimitKernelTime()) {
         double elapse_time = timer->AccumulatedMicros();
         timer->ClearTiming();
-        uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+        uint32_t num_blocks = std::min(
+            static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
         (*tuning_result)[2] = num_blocks;
         const uint32_t block_size = gws[1] / num_blocks;
         if (gws[1] % num_blocks > 0) num_blocks++;
         for (uint32_t i = 0; i < num_blocks; ++i) {
-          uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+          uint32_t gws1 =
+              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
           error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel,
-              cl::NDRange(0, i * block_size),
-              cl::NDRange(gws[0], gws1),
+              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
               cl::NDRange(params[0], params[1]), nullptr, &event);
           MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
           timer->AccumulateTiming();
@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
     return error;
   };
   OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
       event.wait();
@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
       }
     };
   }
-
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 36e9827f..19cc6ff3 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -5,16 +5,16 @@
 #ifndef MACE_KERNELS_OPENCL_HELPER_H_
 #define MACE_KERNELS_OPENCL_HELPER_H_
 
+#include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/types.h"
 #include "mace/utils/utils.h"
-#include "mace/core/future.h"
 
 namespace mace {
 namespace kernels {
 
-const float kMaxKernelExeTime = 1000.0; // microseconds
+const float kMaxKernelExeTime = 1000.0;  // microseconds
 
 enum BufferType {
   CONV2D_FILTER = 0,
@@ -31,7 +31,7 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
                      std::vector<size_t> &image_shape);
 
-std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape, 
+std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                       const BufferType type);
 
 std::string DtToCLCMDDt(const DataType dt);
@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future);
 
-
 void TuningOrRun2DKernel(cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
 }
 
 namespace {
-template<typename T>
+template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
   (*ss) << v;
 }
 
-template<typename T, typename... Args>
+template <typename T, typename... Args>
 void AppendToStream(std::stringstream *ss,
                     const std::string &delimiter,
                     T first,
@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
 }
 }  // namespace
 
-template<typename... Args>
+template <typename... Args>
 std::string Concat(Args... args) {
   std::stringstream ss;
   AppendToStream(&ss, "_", args...);
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 77560853..d453c293 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -11,12 +11,10 @@ namespace mace {
 namespace kernels {
 
 template <typename T>
-void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
-    const Tensor *A,
-    const Tensor *B,
-    Tensor *C,
-    StatsFuture *future) {
-
+void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
+                                                      const Tensor *B,
+                                                      Tensor *C,
+                                                      StatsFuture *future) {
   std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
   std::vector<size_t> c_image_shape;
   CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
 
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(A->opencl_image()));
-    kernel_.setArg(idx++,
-                         *(B->opencl_image()));
+    kernel_.setArg(idx++, *(B->opencl_image()));
     kernel_.setArg(idx++, *(C->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(height));
     kernel_.setArg(idx++, static_cast<int>(width));
@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
   };
   const std::vector<uint32_t> lws = {16, 64, 1};
   std::stringstream ss;
-  ss << "matmul_opencl_kernel_"
-     << C->dim(0) << "_"
-     << C->dim(1) << "_"
-     << C->dim(2) << "_"
-     << C->dim(3);
+  ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
+     << C->dim(2) << "_" << C->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
-
 };
 
-template
-struct MatMulFunctor<DeviceType::OPENCL, float>;
+template struct MatMulFunctor<DeviceType::OPENCL, float>;
 
-template
-struct MatMulFunctor<DeviceType::OPENCL, half>;
+template struct MatMulFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 1272a4fb..d9256776 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -11,17 +11,15 @@
 namespace mace {
 namespace kernels {
 
-template<typename T>
+template <typename T>
 void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                        Tensor *output,
                                                        StatsFuture *future) {
   MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
-    << "Pooling opencl kernel not support dilation yet";
+      << "Pooling opencl kernel not support dilation yet";
   std::vector<index_t> output_shape(4);
-  std::vector<index_t> filter_shape = {
-      kernels_[0], kernels_[1],
-      input->dim(3), input->dim(3)
-  };
+  std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
+                                       input->dim(3)};
 
   std::vector<int> paddings(2);
   if (paddings_.empty()) {
@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
 
   const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blocks),
-      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
       static_cast<uint32_t>(batch * out_height),
   };
   std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::stringstream ss;
-  ss << "pooling_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
+  ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
-
 }
 
-template
-struct PoolingFunctor<DeviceType::OPENCL, float>;
-template
-struct PoolingFunctor<DeviceType::OPENCL, half>;
+template struct PoolingFunctor<DeviceType::OPENCL, float>;
+template struct PoolingFunctor<DeviceType::OPENCL, half>;
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 5761d3cb..470a335d 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -2,12 +2,12 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+#include "mace/kernels/resize_bilinear.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     std::vector<index_t> output_shape{batch, out_height, out_width, channels};
 
     std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape,
-                    BufferType::IN_OUT_CHANNEL,
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                     output_image_shape);
     output->ResizeImage(output_shape, output_image_shape);
 
     float height_scale =
         CalculateResizeScale(in_height, out_height, align_corners_);
-    float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
+    float width_scale =
+        CalculateResizeScale(in_width, out_width, align_corners_);
 
     auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    kernel_ =
+        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
 
     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
                            static_cast<uint32_t>(out_height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::stringstream ss;
-  ss << "resize_bilinear_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
+  ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
+     << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index a3336aa6..25e1c9e4 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -6,13 +6,13 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-template<typename T>
+template <typename T>
 void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                                                        Tensor *output,
                                                        StatsFuture *future) {
@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::stringstream ss;
-  ss << "softmax_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
+  ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
-template
-struct SoftmaxFunctor<DeviceType::OPENCL, float>;
-template
-struct SoftmaxFunctor<DeviceType::OPENCL, half>;
+template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
+template struct SoftmaxFunctor<DeviceType::OPENCL, half>;
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 2eb06027..0cecb0a7 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -5,20 +5,21 @@
 #ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
 #define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
 
-#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/space_to_batch.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
-#include "mace/utils/utils.h"
 #include "mace/utils/tuner.h"
+#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
 template <typename T>
-void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor,
-                                                            const std::vector<index_t> &output_shape,
-                                                            Tensor *batch_tensor,
-                                                            StatsFuture *future) {
+void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
+    Tensor *space_tensor,
+    const std::vector<index_t> &output_shape,
+    Tensor *batch_tensor,
+    StatsFuture *future) {
   const char *kernel_name = nullptr;
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
     kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
     built_options.emplace(kernel_name_ss.str());
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
-    kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options);
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToCLCMDDt(DataTypeToEnum<T>::value));
+    kernel_ =
+        runtime->BuildKernel("space_to_batch", kernel_name, built_options);
 
     uint32_t idx = 0;
     if (b2s_) {
@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
   }
 
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
-  const uint32_t gws[3] = {chan_blk,
-                           static_cast<uint32_t>(batch_tensor->dim(2)),
-                           static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
+  const uint32_t gws[3] = {
+      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
+      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::stringstream ss;
-  ss << kernel_name << "_"
-     << batch_tensor->dim(0) << "_"
-     << batch_tensor->dim(1) << "_"
-     << batch_tensor->dim(2) << "_"
+  ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
+     << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
      << batch_tensor->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 8fd17f21..ee7d5d12 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -11,21 +11,21 @@
 namespace mace {
 namespace kernels {
 
-template<typename T>
-void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
-                                                                 Tensor *output_tensor,
-                                                                 StatsFuture *future) {
+template <typename T>
+void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
+    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
   std::vector<index_t> output_shape(4);
   std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
   std::vector<int> paddings(2);
   if (paddings_.empty()) {
     kernels::CalcNHWCPaddingAndOutputSize(
-        input_tensor->shape().data(), filter_shape.data(), dilations_.data(), strides_.data(),
-        padding_type_, output_shape.data(), paddings.data());
+        input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
+        strides_.data(), padding_type_, output_shape.data(), paddings.data());
   } else {
     paddings = paddings_;
-    CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(),
-                   dilations_.data(), strides_.data(), RoundType::FLOOR, output_shape.data());
+    CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
+                   paddings_.data(), dilations_.data(), strides_.data(),
+                   RoundType::FLOOR, output_shape.data());
   }
 
   const index_t round_h = (output_shape[1] + 1) / 2;
@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
     CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
     output_tensor->ResizeImage(output_shape, image_shape);
 
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
+    std::string obfuscated_kernel_name =
+        MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
     std::set<std::string> built_options;
     built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
     auto runtime = OpenCLRuntime::Global();
-    kernel_ = runtime->BuildKernel("winograd_transform",
-                                   obfuscated_kernel_name,
+    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
 
     uint32_t idx = 0;
@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
   }
 
-  const uint32_t gws[2] = {static_cast<uint32_t>(out_width),
-                         static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
   const std::vector<uint32_t> lws = {128, 8, 1};
   std::stringstream ss;
-  ss << "winograd_transform_kernel_"
-     << input_tensor->dim(0) << "_"
-     << input_tensor->dim(1) << "_"
-     << input_tensor->dim(2) << "_"
+  ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
+     << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
      << input_tensor->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
-template<typename T>
-void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
-                                                                        const Tensor *bias,
-                                                                        Tensor *output_tensor,
-                                                                        StatsFuture *future) {
-  std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)};
+template <typename T>
+void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
+    const Tensor *input_tensor,
+    const Tensor *bias,
+    Tensor *output_tensor,
+    StatsFuture *future) {
+  std::vector<index_t> output_shape = {batch_, height_, width_,
+                                       input_tensor->dim(1)};
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
   output_tensor->ResizeImage(output_shape, image_shape);
 
   if (kernel_.get() == nullptr) {
-    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
+    std::string obfuscated_kernel_name =
+        MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
     std::set<std::string> built_options;
-    built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
-    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
+                          obfuscated_kernel_name);
+    built_options.emplace("-DDATA_TYPE=" +
+                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" +
+                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation_) {
       case NOOP:
@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
     }
 
     auto runtime = OpenCLRuntime::Global();
-    kernel_ = runtime->BuildKernel("winograd_transform",
-                                   obfuscated_kernel_name,
+    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
 
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
     uint32_t idx = 0;
-    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
+    kernel_.setArg(
+        idx++,
+        *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
     if (bias != nullptr) {
-      kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->opencl_image())));
+      kernel_.setArg(idx++,
+                     *(static_cast<const cl::Image2D *>(bias->opencl_image())));
     }
-    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
+    kernel_.setArg(
+        idx++, *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
     kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
     kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
     kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
     kernel_.setArg(idx++, relux_max_limit_);
   }
 
-  const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)),
-                         static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(input_tensor->dim(2)),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
   const std::vector<uint32_t> lws = {128, 8, 1};
 
   std::stringstream ss;
-  ss << "winograd_inverse_transform_kernel_"
-     << input_tensor->dim(0) << "_"
-     << input_tensor->dim(1) << "_"
-     << input_tensor->dim(2) << "_"
+  ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
+     << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
      << input_tensor->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
-template
-struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
-template
-struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
+template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
+template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
 
-template
-struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
-template
-struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
+template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
+template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index dbbfaefc..a2d3bcdb 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -7,9 +7,9 @@
 
 #include <limits>
 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 
@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
   const int *dilations_;
 };
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct PoolingFunctor : PoolingFunctorBase {
   PoolingFunctor(const PoolingType pooling_type,
                  const int *kernels,
@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(pooling_type, kernels,
-                           strides, padding_type,
-                           paddings, dilations) {}
+      : PoolingFunctorBase(
+            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+  }
 
   void operator()(const Tensor *input_tensor,
                   Tensor *output_tensor,
                   StatsFuture *future) {
-
     std::vector<index_t> output_shape(4);
     std::vector<index_t> filter_shape = {
-        kernels_[0], kernels_[1],
-        input_tensor->dim(3), input_tensor->dim(3)
-    };
+        kernels_[0], kernels_[1], input_tensor->dim(3), input_tensor->dim(3)};
 
     std::vector<int> paddings(2);
     if (paddings_.empty()) {
       kernels::CalcNHWCPaddingAndOutputSize(
-          input_tensor->shape().data(), filter_shape.data(), dilations_, strides_,
-          padding_type_, output_shape.data(), paddings.data());
+          input_tensor->shape().data(), filter_shape.data(), dilations_,
+          strides_, padding_type_, output_shape.data(), paddings.data());
     } else {
       paddings = paddings_;
-      CalcOutputSize(input_tensor->shape().data(), filter_shape.data(), paddings_.data(),
-                     dilations_, strides_, RoundType::CEIL, output_shape.data());
+      CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
+                     paddings_.data(), dilations_, strides_, RoundType::CEIL,
+                     output_shape.data());
     }
     output_tensor->Resize(output_shape);
 
@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
-              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
+              index_t out_offset =
+                  (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T res = std::numeric_limits<T>::lowest();
               for (int kh = 0; kh < kernel_h; ++kh) {
@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
                   int inw = padded_w_start + w * stride_w + dilation_w * kw;
                   if (inh >= 0 && inh < input_height && inw >= 0 &&
                       inw < input_width) {
-                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
+                    index_t input_offset =
+                        in_offset + (inh * input_width + inw) * input_channels;
                     res = std::max(res, input[input_offset]);
                   }
                 }
@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
         for (int h = 0; h < height; ++h) {
           for (int w = 0; w < width; ++w) {
             for (int c = 0; c < channels; ++c) {
-              index_t out_offset = (((b * height) + h) * width + w) * channels + c;
+              index_t out_offset =
+                  (((b * height) + h) * width + w) * channels + c;
               index_t in_offset = b * in_image_size * input_channels + c;
               T sum = 0;
               int block_size = 0;
@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
                   int inw = padded_w_start + w * stride_w + dilation_w * kw;
                   if (inh >= 0 && inh < input_height && inw >= 0 &&
                       inw < input_width) {
-                    index_t input_offset = in_offset + (inh * input_width + inw) * input_channels;
+                    index_t input_offset =
+                        in_offset + (inh * input_width + inw) * input_channels;
                     sum += input[input_offset];
                     block_size += 1;
                   }
@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
       }
     }
   }
-
 };
 
-template<>
+template <>
 void PoolingFunctor<DeviceType::NEON, float>::operator()(
-    const Tensor *input_tensor,
-    Tensor *output_tensor,
-    StatsFuture *future);
+    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);
 
-template<typename T>
+template <typename T>
 struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
   PoolingFunctor(const PoolingType pooling_type,
                  const int *kernels,
@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
                  const Padding padding_type,
                  const std::vector<int> &paddings,
                  const int *dilations)
-      : PoolingFunctorBase(pooling_type, kernels,
-                           strides, padding_type,
-                           paddings, dilations) {}
+      : PoolingFunctorBase(
+            pooling_type, kernels, strides, padding_type, paddings, dilations) {
+  }
   void operator()(const Tensor *input_tensor,
                   Tensor *output_tensor,
                   StatsFuture *future);
diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h
index 4d37a199..544ba360 100644
--- a/mace/kernels/reshape.h
+++ b/mace/kernels/reshape.h
@@ -5,8 +5,8 @@
 #define MACE_KERNELS_RESHAPE_H_
 
 #include "mace/core/future.h"
-#include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 
 namespace mace {
 namespace kernels {
@@ -25,7 +25,6 @@ struct ReshapeFunctor {
   }
 };
 
-
 }  // namespace kernels
 }  // namespace mace
 
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 1762cb3b..bdd94192 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -5,8 +5,8 @@
 #define MACE_KERNELS_RESIZE_BILINEAR_H_
 
 #include "mace/core/future.h"
-#include "mace/core/tensor.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
 
 namespace mace {
 namespace kernels {
@@ -74,9 +74,9 @@ void ResizeImage(const T *images,
       const T *batch_input_ptr = images + in_batch_num_values * b;
       T *batch_output_ptr = output + out_batch_num_values * b;
       const T *y_lower_input_ptr =
-        batch_input_ptr + ys[y].lower * in_width * channels;
+          batch_input_ptr + ys[y].lower * in_width * channels;
       const T *y_upper_input_ptr =
-        batch_input_ptr + ys[y].upper * in_width * channels;
+          batch_input_ptr + ys[y].upper * in_width * channels;
       T *y_output_ptr = batch_output_ptr + y * out_width * channels;
       const float ys_lerp = ys[y].lerp;
 
@@ -95,7 +95,7 @@ void ResizeImage(const T *images,
           const T bottom_right = bottom_right_ptr[c];
 
           output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
-              bottom_right, xs_lerp, ys_lerp);
+                                      bottom_right, xs_lerp, ys_lerp);
         }
       }
     }
@@ -107,10 +107,10 @@ struct ResizeBilinearFunctorBase {
   ResizeBilinearFunctorBase(const std::vector<index_t> &size,
                             bool align_corners)
       : align_corners_(align_corners) {
-        MACE_CHECK(size.size() == 2);
-        out_height_ = size[0];
-        out_width_ = size[1];
-      }
+    MACE_CHECK(size.size() == 2);
+    out_height_ = size[0];
+    out_width_ = size[1];
+  }
 
  protected:
   bool align_corners_;
@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
   }
 };
 
-template<typename T>
-struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase {
+template <typename T>
+struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
+    : ResizeBilinearFunctorBase {
   ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
       : ResizeBilinearFunctorBase(size, align_corners) {}
 
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 4b4c15f2..402bf97c 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -6,9 +6,9 @@
 #define MACE_KERNELS_CONV_2D_H_
 
 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -16,11 +16,10 @@ namespace kernels {
 struct SpaceToBatchFunctorBase {
   SpaceToBatchFunctorBase(const std::vector<int> &paddings,
                           const std::vector<int> &block_shape,
-                          bool b2s):
-      paddings_(paddings.begin(), paddings.end()),
-      block_shape_(block_shape.begin(), block_shape.end()),
-      b2s_(b2s)
-  {}
+                          bool b2s)
+      : paddings_(paddings.begin(), paddings.end()),
+        block_shape_(block_shape.begin(), block_shape.end()),
+        b2s_(b2s) {}
 
   std::vector<int> paddings_;
   std::vector<int> block_shape_;
@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
 };
 
 template <DeviceType D, typename T>
-struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
+struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(const std::vector<int> &paddings,
                       const std::vector<int> &block_shape,
-                      bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){}
+                      bool b2s)
+      : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
 
   void operator()(Tensor *space_tensor,
                   const std::vector<index_t> &output_shape,
@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
 };
 
 template <typename T>
-struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
+struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
   SpaceToBatchFunctor(const std::vector<int> &paddings,
                       const std::vector<int> &block_shape,
-                      bool b2s): SpaceToBatchFunctorBase(paddings, block_shape, b2s){}
+                      bool b2s)
+      : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}
 
   void operator()(Tensor *space_tensor,
                   const std::vector<index_t> &output_shape,
@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
                   StatsFuture *future);
 
   cl::Kernel kernel_;
-
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index fdab5c7c..464a59ce 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -6,10 +6,10 @@
 #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
 
 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
-#include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/kernels/activation.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/kernels/conv_pool_2d_util.h"
 
 namespace mace {
 namespace kernels {
@@ -17,38 +17,36 @@ namespace kernels {
 struct WinogradTransformFunctorBase {
   WinogradTransformFunctorBase(const Padding &padding_type,
                                const std::vector<int> &paddings)
-      : strides_({1, 1}), dilations_({1, 1}),
-        padding_type_(padding_type), paddings_(paddings) {}
+      : strides_({1, 1}),
+        dilations_({1, 1}),
+        padding_type_(padding_type),
+        paddings_(paddings) {}
 
-  const std::vector<int> strides_;         // [stride_h, stride_w]
-  const std::vector<int> dilations_;       // [dilation_h, dilation_w]
+  const std::vector<int> strides_;    // [stride_h, stride_w]
+  const std::vector<int> dilations_;  // [dilation_h, dilation_w]
   Padding padding_type_;
   std::vector<int> paddings_;
 };
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct WinogradTransformFunctor : WinogradTransformFunctorBase {
   WinogradTransformFunctor(const Padding &padding_type,
                            const std::vector<int> &paddings)
       : WinogradTransformFunctorBase(padding_type, paddings) {}
 
-  void operator()(const Tensor *input,
-                  Tensor *output,
-                  StatsFuture *future) {
+  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
     MACE_NOT_IMPLEMENTED;
   }
-
 };
 
-template<typename T>
-struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFunctorBase {
+template <typename T>
+struct WinogradTransformFunctor<DeviceType::OPENCL, T>
+    : WinogradTransformFunctorBase {
   WinogradTransformFunctor(const Padding &padding_type,
                            const std::vector<int> &paddings)
       : WinogradTransformFunctorBase(padding_type, paddings) {}
 
-  void operator()(const Tensor *input,
-                  Tensor *output,
-                  StatsFuture *future);
+  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
   cl::Kernel kernel_;
 };
@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
   const float relux_max_limit_;
 };
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
   WinogradInverseTransformFunctor(const int batch,
                                   const int height,
                                   const int width,
                                   const ActivationType activation,
                                   const float relux_max_limit)
-      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
+      : WinogradInverseTransformFunctorBase(
+            batch, height, width, activation, relux_max_limit) {}
 
   void operator()(const Tensor *input,
                   const Tensor *bias,
@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
                   StatsFuture *future) {
     MACE_NOT_IMPLEMENTED;
   }
-
 };
 
-template<typename T>
-struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase {
+template <typename T>
+struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
+    : WinogradInverseTransformFunctorBase {
   WinogradInverseTransformFunctor(const int batch,
                                   const int height,
                                   const int width,
                                   const ActivationType activation,
                                   const float relux_max_limit)
-      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
+      : WinogradInverseTransformFunctorBase(
+            batch, height, width, activation, relux_max_limit) {}
 
   void operator()(const Tensor *input,
                   const Tensor *bias,
diff --git a/mace/ops/activation.h b/mace/ops/activation.h
index a55dfe1a..5f08bc26 100644
--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
 
   bool Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(0);
-    const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr;
+    const Tensor *alpha_tensor =
+        this->InputSize() >= 2 ? this->Input(1) : nullptr;
     Tensor *output_tensor = this->outputs_[0];
     output_tensor->ResizeLike(input_tensor);
 
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index ce5ddd45..77f8e745 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -214,9 +214,7 @@ void TestSimplePrelu() {
   net.AddInputFromArray<D, float>(
       "Input", {2, 2, 2, 2},
       {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
-  net.AddInputFromArray<D, float>(
-      "Alpha", {2},
-      {2.0, 3.0});
+  net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});
 
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
@@ -250,7 +248,8 @@ void TestSimplePrelu() {
   }
 
   auto expected = CreateTensor<float>(
-      {2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+      {2, 2, 2, 2},
+      {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
diff --git a/mace/ops/addn.h b/mace/ops/addn.h
index 9adc3341..5824844b 100644
--- a/mace/ops/addn.h
+++ b/mace/ops/addn.h
@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
     for (int i = 1; i < n; ++i) {
       inputs[i] = this->Input(i);
       MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
-      MACE_CHECK(inputs[0]->size() == inputs[i]->size()) << "Input 0: "
-                                                         << MakeString(inputs[0]->shape())
-                                                         << ", size: " << inputs[0]->size()
-                                                         << ". Input " << i << ": "
-                                                         << MakeString(inputs[i]->shape())
-                                                         << ", size: " << inputs[i]->size();
+      MACE_CHECK(inputs[0]->size() == inputs[i]->size())
+          << "Input 0: " << MakeString(inputs[0]->shape())
+          << ", size: " << inputs[0]->size() << ". Input " << i << ": "
+          << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
     }
 
     functor_(inputs, output_tensor, future);
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index 8ffccad2..85c7853d 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   OpsTestNet net;
   // Add input data
   for (int i = 0; i < inputs; ++i) {
-    net.AddRandomInput<D, float>(MakeString("Input", i).c_str(),
-                                 {n, h, w, c});
+    net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
   }
 
   if (D == DeviceType::OPENCL) {
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index b0975aa3..5091e26e 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -76,7 +76,7 @@ static void BatchNorm(
   static void BM_BATCH_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                     \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;   \
-    mace::testing::MaccProcessed(tot);                                \
+    mace::testing::MaccProcessed(tot);                                 \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                \
     BatchNorm<DEVICE, TYPE>(iters, N, C, H, W);                        \
   }                                                                    \
diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h
index 59f8e03b..91d2c0c6 100644
--- a/mace/ops/batch_to_space.h
+++ b/mace/ops/batch_to_space.h
@@ -12,15 +12,14 @@
 
 namespace mace {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class BatchToSpaceNDOp : public Operator<D, T> {
  public:
   BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
       : Operator<D, T>(op_def, ws),
-        functor_(
-            OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
-            OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
-            true) {}
+        functor_(OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
+                 OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
+                 true) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *batch_tensor = this->Input(INPUT);
@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
 
     std::vector<index_t> output_shape(4, 0);
     CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
-    functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor), future);
+    functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
+             future);
     return true;
   }
 
@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
                                    Tensor *output,
                                    index_t *output_shape) {
     auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
-    auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
+    auto block_shape =
+        OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
     MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
     MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
     MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D");
@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
     const index_t block_dims = block_shape.size();
     index_t block_shape_product = 1;
     for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
-      MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1");
+      MACE_CHECK(block_shape[block_dim] > 1,
+                 "block_shape's value should be great to 1");
       const index_t block_shape_value = block_shape[block_dim];
-      const index_t cropped_input_size = input_tensor->dim(block_dim + 1) * block_shape_value
-          - crops[block_dim * 2]
-          - crops[block_dim * 2 + 1];
-      MACE_CHECK(cropped_input_size >= 0,
-                 "cropped size must be non-negative");
+      const index_t cropped_input_size =
+          input_tensor->dim(block_dim + 1) * block_shape_value -
+          crops[block_dim * 2] - crops[block_dim * 2 + 1];
+      MACE_CHECK(cropped_input_size >= 0, "cropped size must be non-negative");
       block_shape_product *= block_shape_value;
       output_shape[block_dim + 1] = cropped_input_size;
     }
diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc
index bac02236..aa68adc4 100644
--- a/mace/ops/batch_to_space_benchmark.cc
+++ b/mace/ops/batch_to_space_benchmark.cc
@@ -41,7 +41,7 @@ static void BMBatchToSpace(
       BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE( \
           int iters) {                                                     \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;       \
-    mace::testing::MaccProcessed(tot);                                    \
+    mace::testing::MaccProcessed(tot);                                     \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                    \
     BMBatchToSpace<DEVICE, TYPE>(iters, N, C, H, W, ARG);                  \
   }                                                                        \
diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc
index d59885de..8af9405b 100644
--- a/mace/ops/bias_add_benchmark.cc
+++ b/mace/ops/bias_add_benchmark.cc
@@ -53,7 +53,7 @@ static void BiasAdd(int iters, int batch, int channels, int height, int width) {
   static void BM_BIAS_ADD_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE( \
       int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
-    mace::testing::MaccProcessed(tot);                              \
+    mace::testing::MaccProcessed(tot);                               \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));              \
     BiasAdd<DEVICE, TYPE>(iters, N, C, H, W);                        \
   }                                                                  \
diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h
index 72306338..d1d8621b 100644
--- a/mace/ops/buffer_to_image.h
+++ b/mace/ops/buffer_to_image.h
@@ -11,16 +11,17 @@
 namespace mace {
 
 template <DeviceType D, typename T>
-class BufferToImageOp: public Operator<D, T> {
+class BufferToImageOp : public Operator<D, T> {
  public:
   BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws)  {}
+      : Operator<D, T>(op_def, ws) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
 
-    kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
-        "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
+    kernels::BufferType type =
+        static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
+            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     Tensor *output = this->Output(OUTPUT);
 
     functor_(const_cast<Tensor *>(input_tensor), type, output, future);
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index 34c7d16f..04baa382 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -7,8 +7,9 @@
 
 using namespace mace;
 
-template<DeviceType D, typename T>
-void TestBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
+template <DeviceType D, typename T>
+void TestBidirectionTransform(const int type,
+                              const std::vector<index_t> &input_shape) {
   OpsTestNet net;
   OpDefBuilder("BufferToImage", "BufferToImageTest")
       .Input("Input")
@@ -34,7 +35,8 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
   net.RunOp(D);
 
   // Check
-  ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-5);
+  ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
+                      1e-5);
 }
 
 TEST(BufferToImageTest, ArgSmall) {
@@ -54,51 +56,63 @@ TEST(BufferToImageTest, ArgLarge) {
 }
 
 TEST(BufferToImageTest, InputSmallSingleChannel) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 1});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+                                                      {1, 2, 3, 1});
 }
 
 TEST(BufferToImageTest, InputSmallMultipleChannel) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {1, 2, 3, 3});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+                                                      {1, 2, 3, 3});
 }
 
 TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 2, 3, 3});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+                                                      {3, 2, 3, 3});
 }
 
 TEST(BufferToImageTest, InputMedia) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 13, 17, 128});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+                                                      {3, 13, 17, 128});
 }
 
 TEST(BufferToImageTest, InputLarge) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL, {3, 64, 64, 256});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
+                                                      {3, 64, 64, 256});
 }
 
 TEST(BufferToImageTest, Filter1x1Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 3, 5});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+                                                      {1, 1, 3, 5});
 }
 
 TEST(BufferToImageTest, Filter1x1Media) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 13, 17});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+                                                      {1, 1, 13, 17});
 }
 
 TEST(BufferToImageTest, Filter1x1Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 128, 512});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+                                                      {1, 1, 128, 512});
 }
 
 TEST(BufferToImageTest, Filter3x3Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 3, 5});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+                                                      {3, 3, 3, 5});
 }
 
 TEST(BufferToImageTest, Filter3x3Meida) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 13, 17});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+                                                      {3, 3, 13, 17});
 }
 
 TEST(BufferToImageTest, Filter3x3Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 128, 256});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
+                                                      {3, 3, 128, 256});
 }
 
-template<DeviceType D, typename T>
-void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t> &input_shape) {
+template <DeviceType D, typename T>
+void TestDiffTypeBidirectionTransform(const int type,
+                                      const std::vector<index_t> &input_shape) {
   OpsTestNet net;
   OpDefBuilder("BufferToImage", "BufferToImageTest")
       .Input("Input")
@@ -123,14 +137,16 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t>
   net.RunOp(D);
 
   // Check
-  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
+  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
+                          1e-2);
 }
 
 TEST(BufferToImageTest, ArgFloatToHalfSmall) {
-  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {11});
+  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
+                                                             {11});
 }
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 void TestStringHalfBidirectionTransform(const int type,
                                         const std::vector<index_t> &input_shape,
                                         const unsigned char *input_data) {
@@ -142,9 +158,10 @@ void TestStringHalfBidirectionTransform(const int type,
       .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef());
 
-  const half *h_data = reinterpret_cast<const half*>(input_data);
+  const half *h_data = reinterpret_cast<const half *>(input_data);
 
-  net.AddInputFromArray<D, half>("Input", input_shape, std::vector<half>(h_data, h_data+2));
+  net.AddInputFromArray<D, half>("Input", input_shape,
+                                 std::vector<half>(h_data, h_data + 2));
 
   // Run
   net.RunOp(D);
@@ -160,12 +177,14 @@ void TestStringHalfBidirectionTransform(const int type,
   net.RunOp(D);
 
   // Check
-  ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"), 1e-2);
+  ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
+                         1e-2);
 }
 
 TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
-  const unsigned char input_data[] = {0xCD, 0x3C, 0x33, 0x40,};
-  TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
-                                                               {2},
-                                                               input_data);
+  const unsigned char input_data[] = {
+      0xCD, 0x3C, 0x33, 0x40,
+  };
+  TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
+      kernels::ARGUMENT, {2}, input_data);
 }
diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h
index 9f6b19be..b87d6263 100644
--- a/mace/ops/channel_shuffle.h
+++ b/mace/ops/channel_shuffle.h
@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
                input->shape()[1]);
 
     output->ResizeLike(input);
-    functor_(input->data<T>(), input->shape().data(),
-             output->mutable_data<T>(), future);
+    functor_(input->data<T>(), input->shape().data(), output->mutable_data<T>(),
+             future);
 
     return true;
   }
diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc
index a984b39d..bf64eda2 100644
--- a/mace/ops/channel_shuffle_benchmark.cc
+++ b/mace/ops/channel_shuffle_benchmark.cc
@@ -41,7 +41,7 @@ static void ChannelShuffle(
   static void BM_CHANNEL_SHUFFLE_##N##_##C##_##H##_##W##_##G##_##DEVICE( \
       int iters) {                                                       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;     \
-    mace::testing::MaccProcessed(tot);                                  \
+    mace::testing::MaccProcessed(tot);                                   \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                 \
     ChannelShuffle<DEVICE>(iters, N, C, H, W, G);                        \
   }                                                                      \
diff --git a/mace/ops/concat.h b/mace/ops/concat.h
index 4577dc8b..cadd5293 100644
--- a/mace/ops/concat.h
+++ b/mace/ops/concat.h
@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
  public:
   ConcatOp(const OperatorDef &op_def, Workspace *ws)
       : Operator<D, T>(op_def, ws),
-        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
+        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
 
   bool Run(StatsFuture *future) override {
-    MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
+    MACE_CHECK(this->InputSize() >= 2)
+        << "There must be at least two inputs to concat";
     const std::vector<const Tensor *> input_list = this->Inputs();
     const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
     const int32_t input_dims = input_list[0]->dim_size();
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index cc447b83..bbbbc126 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -37,11 +37,10 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   }
 }
 
-#define BM_CONCAT_CPU_MACRO(DIM0, DIM1) \
-  static void BM_CONCAT_CPU_##DIM0##_##DIM1( \
-      int iters) { \
+#define BM_CONCAT_CPU_MACRO(DIM0, DIM1)                      \
+  static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) {     \
     ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
-  } \
+  }                                                          \
   BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
 
 BM_CONCAT_CPU_MACRO(0, 1000);
@@ -90,13 +89,11 @@ static void OpenclConcatHelper(int iters,
   }
 }
 
-
-#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE) \
-  static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE( \
-      int iters) { \
-    std::vector<index_t> shape = {N, H, W, C}; \
-    OpenclConcatHelper<TYPE>(iters, shape, shape, 3); \
-  } \
+#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE)                           \
+  static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
+    std::vector<index_t> shape = {N, H, W, C};                             \
+    OpenclConcatHelper<TYPE>(iters, shape, shape, 3);                      \
+  }                                                                        \
   BENCHMARK(BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE)
 
 BM_CONCAT_OPENCL_MACRO(3, 32, 32, 32, float);
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index a49e593c..2e061ad4 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
     concat_axis_size += input_shapes[i][axis];
     GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
     input_ptrs[i] = inputs[i].data();
-    net.AddInputFromArray<DeviceType::CPU, float>(
-        MakeString("Input", i), input_shapes[i], inputs[i]);
+    net.AddInputFromArray<DeviceType::CPU, float>(MakeString("Input", i),
+                                                  input_shapes[i], inputs[i]);
   }
 
   // Run
@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
 }
 
 TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
-  OpenclRandomTest<float>({{3, 32, 32, 32}, {3, 32, 32, 32},
-                           {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
+  OpenclRandomTest<float>(
+      {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}}, 3);
 }
\ No newline at end of file
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 184772c4..086f7328 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#include <fstream>
 #include "mace/ops/conv_2d.h"
+#include <fstream>
 #include "mace/ops/ops_test_util.h"
 
 using namespace mace;
@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
 TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::OPENCL>(); }
 
 template <DeviceType D, typename T>
-static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int stride) {
+static void TestComplexConvNxNS12(const std::vector<index_t> &shape,
+                                  const int stride) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
 }
 
 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
-                                                   1);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
-                                                   2);
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 1);
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 2);
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7},
-                                                   1);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7},
-                                                   2);
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 1);
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({17, 113, 5, 7}, 2);
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17},
-                                                   3);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17},
-                                                   4);
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({31, 113, 13, 17}, 3);
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 13, 17}, 4);
 }
 
-template<DeviceType D>
+template <DeviceType D>
 static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
                                       const std::vector<index_t> &filter_shape,
                                       const std::vector<int> &dilations) {
@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {1, 1, 32, 64},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 1, 32, 64},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {3, 3, 32, 64},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {3, 3, 32, 64},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {15, 1, 256, 2},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {15, 1, 256, 2},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {1, 15, 256, 2},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {1, 15, 256, 2},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32},
-                                                {7, 7, 3, 64},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 64},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv1x1S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
-                                                {1, 1, 5, 7},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {1, 1, 5, 7},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113},
-                                                {3, 3, 5, 7},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113}, {3, 3, 5, 7},
                                                 {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
-                                                {5, 5, 16, 16},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {5, 5, 16, 16},
                                                 {2, 2});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation2) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64},
-                                                {7, 7, 16, 16},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({64, 64}, {7, 7, 16, 16},
                                                 {2, 2});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfConv7x7Dilation4) {
-  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67},
-                                                {7, 7, 16, 16},
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({63, 67}, {7, 7, 16, 16},
                                                 {4, 4});
 }
 
-template<DeviceType D, typename T>
-static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dilation_rate) {
+template <DeviceType D, typename T>
+static void TestDilationConvNxN(const std::vector<index_t> &shape,
+                                const int dilation_rate) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
     expected.Copy(*net.GetOutput("Output"));
 
     // run on gpu
-    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage",
+                        kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
     // Run on device
     net.RunOp(D);
 
-    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
+                        kernels::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };
 
@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
 }
 
 TEST_F(Conv2dOpTest, OPENCLAlignedDilation2) {
-  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
-                                                 2);
+  TestDilationConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, 2);
 }
 
 TEST_F(Conv2dOpTest, OPENCLAligned2Dilation4) {
-  TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16},
-                                                 4);
+  TestDilationConvNxN<DeviceType::OPENCL, float>({128, 128, 16, 16}, 4);
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
-  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
-                                                 4);
+  TestDilationConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 4);
 }
 
-template<DeviceType D, typename T>
-static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std::vector<int> &paddings) {
+template <DeviceType D, typename T>
+static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
+                                    const std::vector<int> &paddings) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
     srand(time(NULL));
@@ -698,9 +686,12 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
     expected.Copy(*net.GetOutput("Output"));
 
     // run on gpu
-    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage",
+                        kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -714,7 +705,8 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
     // Run on device
     net.RunOp(D);
 
-    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
+                        kernels::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };
 
@@ -726,8 +718,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
 }
 
 TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
-  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64},
-                                                     {1, 1});
+  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
@@ -736,6 +727,5 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
-  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7},
-                                                     {4, 4});
+  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
 }
diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h
index 5bbd9441..621a8f2b 100644
--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
         functor_(static_cast<kernels::EltwiseType>(
                      OperatorBase::GetSingleArgument<int>(
                          "type", static_cast<int>(kernels::EltwiseType::SUM))),
-                 OperatorBase::GetRepeatedArgument<float>("coeff")){}
+                 OperatorBase::GetRepeatedArgument<float>("coeff")) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input0 = this->Input(0);
     const Tensor *input1 = this->Input(1);
     Tensor *output = this->Output(OUTPUT);
-    MACE_CHECK(input0->dim_size() == input1->dim_size()) << "Inputs of Eltwise op must be same shape";
-    for(int i = 0; i < input0->dim_size(); ++i) {
-      MACE_CHECK(input0->dim(i) == input1->dim(i)) << "Inputs of Eltwise op must be same shape";
+    MACE_CHECK(input0->dim_size() == input1->dim_size())
+        << "Inputs of Eltwise op must be same shape";
+    for (int i = 0; i < input0->dim_size(); ++i) {
+      MACE_CHECK(input0->dim(i) == input1->dim(i))
+          << "Inputs of Eltwise op must be same shape";
     }
 
     output->ResizeLike(input0);
diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc
index c2f48643..8dcb243a 100644
--- a/mace/ops/eltwise_benchmark.cc
+++ b/mace/ops/eltwise_benchmark.cc
@@ -61,7 +61,7 @@ static void EltwiseBenchmark(
       BM_ELTWISE_##ELT_TYPE##_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
           int iters) {                                                   \
     const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;     \
-    mace::testing::MaccProcessed(tot);                                  \
+    mace::testing::MaccProcessed(tot);                                   \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                  \
     EltwiseBenchmark<DEVICE, TYPE>(                                      \
         iters, static_cast<kernels::EltwiseType>(ELT_TYPE), N, H, W, C); \
diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc
index 3e3d3362..ae8cf5f0 100644
--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -2,15 +2,15 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
+#include "mace/kernels/eltwise.h"
 #include "mace/core/operator.h"
 #include "mace/ops/ops_test_util.h"
-#include "mace/kernels/eltwise.h"
 
 namespace mace {
 
 class EltwiseOpTest : public OpsTestBase {};
 
-template<DeviceType D>
+template <DeviceType D>
 void Simple(const kernels::EltwiseType type,
             const std::vector<index_t> &shape,
             const std::vector<float> &input0,
@@ -36,8 +36,10 @@ void Simple(const kernels::EltwiseType type,
     // Run
     net.RunOp(D);
   } else {
-    BufferToImage<D, half>(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, half>(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, half>(net, "Input1", "InputImg1",
+                           kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, half>(net, "Input2", "InputImg2",
+                           kernels::BufferType::IN_OUT_CHANNEL);
     OpDefBuilder("Eltwise", "EltwiseTest")
         .Input("InputImg1")
         .Input("InputImg2")
@@ -49,7 +51,8 @@ void Simple(const kernels::EltwiseType type,
     // Run
     net.RunOp(D);
 
-    ImageToBuffer<D, float>(net, "OutputImg", "Output", kernels::BufferType::IN_OUT_CHANNEL);
+    ImageToBuffer<D, float>(net, "OutputImg", "Output",
+                            kernels::BufferType::IN_OUT_CHANNEL);
   }
 
   auto expected = CreateTensor<float>(shape, output);
@@ -58,64 +61,42 @@ void Simple(const kernels::EltwiseType type,
 }
 
 TEST_F(EltwiseOpTest, CPUSimple) {
-  Simple<DeviceType::CPU>(kernels::EltwiseType::PROD,
-                          {1, 1, 2, 3},
-                          {1, 2, 3, 4, 5, 6},
-                          {1, 2, 3, 4, 5, 6},
+  Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
+                          {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                           {1, 4, 9, 16, 25, 36});
-  Simple<DeviceType::CPU>(kernels::EltwiseType::SUM,
-                          {1, 1, 2, 3},
-                          {1, 2, 3, 4, 5, 6},
-                          {1, 2, 3, 4, 5, 6},
+  Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
+                          {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                           {2, 4, 6, 8, 10, 12});
-  Simple<DeviceType::CPU>(kernels::EltwiseType::SUM,
-                          {1, 1, 2, 3},
-                          {1, 2, 3, 4, 5, 6},
-                          {1, 2, 3, 4, 5, 6},
-                          {3, 6, 9, 12, 15, 18},
-                          {2, 1});
-  Simple<DeviceType::CPU>(kernels::EltwiseType::MAX,
-                          {1, 1, 2, 3},
-                          {1, 2, 3, 4, 5, 6},
-                          {1, 1, 3, 3, 6, 6},
+  Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
+                          {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
+                          {3, 6, 9, 12, 15, 18}, {2, 1});
+  Simple<DeviceType::CPU>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
+                          {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                           {1, 2, 3, 4, 6, 6});
-  Simple<DeviceType::CPU>(kernels::EltwiseType::MIN,
-                          {1, 1, 2, 3},
-                          {1, 2, 3, 4, 5, 6},
-                          {1, 1, 3, 3, 6, 6},
+  Simple<DeviceType::CPU>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
+                          {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                           {1, 1, 3, 3, 5, 6});
 }
 
 TEST_F(EltwiseOpTest, GPUSimple) {
-  Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD,
-                             {1, 1, 2, 3},
-                             {1, 2, 3, 4, 5, 6},
-                             {1, 2, 3, 4, 5, 6},
+  Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
+                             {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                              {1, 4, 9, 16, 25, 36});
-  Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM,
-                             {1, 1, 2, 3},
-                             {1, 2, 3, 4, 5, 6},
-                             {1, 2, 3, 4, 5, 6},
+  Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
+                             {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                              {2, 4, 6, 8, 10, 12});
-  Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM,
-                             {1, 1, 2, 3},
-                             {1, 2, 3, 4, 5, 6},
-                             {1, 2, 3, 4, 5, 6},
-                             {3, 6, 9, 12, 15, 18},
-                             {2, 1});
-  Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX,
-                             {1, 1, 2, 3},
-                             {1, 2, 3, 4, 5, 6},
-                             {1, 1, 3, 3, 6, 6},
+  Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
+                             {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
+                             {3, 6, 9, 12, 15, 18}, {2, 1});
+  Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
+                             {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                              {1, 2, 3, 4, 6, 6});
-  Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN,
-                             {1, 1, 2, 3},
-                             {1, 2, 3, 4, 5, 6},
-                             {1, 1, 3, 3, 6, 6},
+  Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
+                             {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                              {1, 1, 3, 3, 5, 6});
 }
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 void RandomTest(const kernels::EltwiseType type,
                 const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
@@ -139,8 +120,10 @@ void RandomTest(const kernels::EltwiseType type,
   // Run
   net.RunOp();
 
-  BufferToImage<D, T>(net, "Input1", "InputImg1", kernels::BufferType::IN_OUT_CHANNEL);
-  BufferToImage<D, T>(net, "Input2", "InputImg2", kernels::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<D, T>(net, "Input1", "InputImg1",
+                      kernels::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<D, T>(net, "Input2", "InputImg2",
+                      kernels::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("InputImg1")
       .Input("InputImg2")
@@ -153,12 +136,15 @@ void RandomTest(const kernels::EltwiseType type,
   // Run
   net.RunOp(D);
 
-  ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
+  ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput",
+                          kernels::BufferType::IN_OUT_CHANNEL);
 
   if (DataTypeToEnum<T>::value == DT_FLOAT) {
-    ExpectTensorNear<float>(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-3);
+    ExpectTensorNear<float>(*net.GetTensor("Output"),
+                            *net.GetOutput("OPENCLOutput"), 1e-3);
   } else {
-    ExpectTensorNear<float>(*net.GetTensor("Output"), *net.GetOutput("OPENCLOutput"), 1e-1);
+    ExpectTensorNear<float>(*net.GetTensor("Output"),
+                            *net.GetOutput("OPENCLOutput"), 1e-1);
   }
 }
 
diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc
index 8658d577..5847ab94 100644
--- a/mace/ops/folded_batch_norm.cc
+++ b/mace/ops/folded_batch_norm.cc
@@ -7,25 +7,22 @@
 namespace mace {
 
 void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry,
-                    OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
+                                     .Device(DeviceType::CPU)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
                     FoldedBatchNormOp<DeviceType::CPU, float>);
 
-  REGISTER_OPERATOR(op_registry,
-                    OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::OPENCL)
-                        .TypeConstraint<float>("T")
-                        .Build(),
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
                     FoldedBatchNormOp<DeviceType::OPENCL, float>);
 
-  REGISTER_OPERATOR(op_registry,
-                    OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::OPENCL)
-                        .TypeConstraint<half>("T")
-                        .Build(),
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<half>("T")
+                                     .Build(),
                     FoldedBatchNormOp<DeviceType::OPENCL, half>);
 }
 
diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc
index 45bd6736..77bf351d 100644
--- a/mace/ops/folded_batch_norm_test.cc
+++ b/mace/ops/folded_batch_norm_test.cc
@@ -17,7 +17,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
                           std::vector<float> &scale,
                           std::vector<float> &offset) {
   size_t size = gamma.size();
-  for (int i = 0 ; i < size; ++i) {
+  for (int i = 0; i < size; ++i) {
     scale[i] = gamma[i] / std::sqrt(var[i] + epsilon);
     offset[i] = offset[i] - mean[i] * scale[i];
   }
diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h
index c65947af..2f915149 100644
--- a/mace/ops/fully_connected.h
+++ b/mace/ops/fully_connected.h
@@ -15,11 +15,10 @@ class FullyConnectedOp : public Operator<D, T> {
  public:
   FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws),
-        functor_(
-            kernels::StringToActivationType(
-                OperatorBase::GetSingleArgument<std::string>("activation",
-                                                             "NOOP")),
-            OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
+        functor_(kernels::StringToActivationType(
+                     OperatorBase::GetSingleArgument<std::string>("activation",
+                                                                  "NOOP")),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc
index 04776899..9ada2c54 100644
--- a/mace/ops/fully_connected_benchmark.cc
+++ b/mace/ops/fully_connected_benchmark.cc
@@ -17,16 +17,17 @@ static void FCBenchmark(
 
   // Add input data
   net.AddRandomInput<D, float>("Input", {batch, height, width, channel});
-  net.AddRandomInput<D, float>("Weight", {out_channel, height * width * channel});
+  net.AddRandomInput<D, float>("Weight",
+                               {out_channel, height * width * channel});
   net.AddRandomInput<D, float>("Bias", {out_channel});
 
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT_CHANNEL);
+                        kernels::BufferType::IN_OUT_CHANNEL);
     BufferToImage<D, T>(net, "Weight", "WeightImage",
-                            kernels::BufferType::WEIGHT_HEIGHT);
+                        kernels::BufferType::WEIGHT_HEIGHT);
     BufferToImage<D, T>(net, "Bias", "BiasImage",
-                            kernels::BufferType::ARGUMENT);
+                        kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("FC", "FullyConnectedTest")
         .Input("InputImage")
@@ -57,14 +58,17 @@ static void FCBenchmark(
   net.Sync();
 }
 
-#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE)                              \
-  static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE(int iters) { \
-    const int64_t macc = static_cast<int64_t>(iters) * N * C * H * W * OC + OC;  \
-    const int64_t tot = static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC; \
-    mace::testing::MaccProcessed(macc);                                          \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                          \
-    FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC);                            \
-  }                                                                              \
+#define BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE)                     \
+  static void BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE( \
+      int iters) {                                                    \
+    const int64_t macc =                                              \
+        static_cast<int64_t>(iters) * N * C * H * W * OC + OC;        \
+    const int64_t tot =                                               \
+        static_cast<int64_t>(iters) * (N + OC) * C * H * W + OC;      \
+    mace::testing::MaccProcessed(macc);                               \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));               \
+    FCBenchmark<DEVICE, TYPE>(iters, N, H, W, C, OC);                 \
+  }                                                                   \
   BENCHMARK(BM_FC_##N##_##H##_##W##_##C##_##OC##_##TYPE##_##DEVICE)
 
 #define BM_FC(N, H, W, C, OC)                 \
diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc
index a945f41a..3a41dd87 100644
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -10,7 +10,7 @@ namespace mace {
 
 class FullyConnectedOpTest : public OpsTestBase {};
 
-template<DeviceType D>
+template <DeviceType D>
 void Simple(const std::vector<index_t> &input_shape,
             const std::vector<float> &input_value,
             const std::vector<index_t> &weight_shape,
@@ -58,83 +58,52 @@ void Simple(const std::vector<index_t> &input_shape,
   }
 
   // Check
-  auto expected =
-      CreateTensor<float>(output_shape, output_value);
+  auto expected = CreateTensor<float>(output_shape, output_value);
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 
 TEST_F(FullyConnectedOpTest, SimpleCPU) {
-  Simple<DeviceType::CPU>({1, 2, 2, 2},
-                          {1, 2, 3, 4, 5, 6, 7, 8},
-                          {1, 8},
-                          {1, 2, 3, 4, 5, 6, 7, 8},
-                          {1}, {2},
-                          {1, 1, 1, 1}, {206});
-  Simple<DeviceType::CPU>({1, 1, 2, 5},
-                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
-                          {2, 10},
-                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                           10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
-                          {2}, {2, 3},
-                          {1, 1, 1, 2}, {387, 3853});
-  Simple<DeviceType::CPU>({1, 1, 2, 3},
-                          {1, 2, 3, 4, 5, 6},
-                          {5, 6},
-                          {1, 2, 3, 4, 5, 6,
-                           10, 20, 30, 40, 50, 60,
-                           1, 2, 3, 4, 5, 6,
-                           10, 20, 30, 40, 50, 60,
-                           1, 2, 3, 4, 5, 6},
-                          {5}, {1, 2, 3, 4, 5},
-                          {1, 1, 1, 5}, {92, 912, 94, 914, 96});
+  Simple<DeviceType::CPU>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
+                          {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
+                          {206});
+  Simple<DeviceType::CPU>(
+      {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
+      {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
+  Simple<DeviceType::CPU>(
+      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
+      {1, 2, 3, 4,  5,  6,  10, 20, 30, 40, 50, 60, 1, 2, 3,
+       4, 5, 6, 10, 20, 30, 40, 50, 60, 1,  2,  3,  4, 5, 6},
+      {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
 }
 
 TEST_F(FullyConnectedOpTest, SimpleCPUWithBatch) {
-  Simple<DeviceType::CPU>({2, 1, 2, 2},
-                          {1, 2, 3, 4, 5, 6, 7, 8},
-                          {1, 4},
-                          {1, 2, 3, 4},
-                          {1}, {2},
-                          {2, 1, 1, 1}, {32, 72});
+  Simple<DeviceType::CPU>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
+                          {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
 }
 
 TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
-  Simple<DeviceType::OPENCL>({1, 2, 2, 2},
-                             {1, 2, 3, 4, 5, 6, 7, 8},
-                             {1, 8},
-                             {1, 2, 3, 4, 5, 6, 7, 8},
-                             {1}, {2},
-                             {1, 1, 1, 1}, {206});
-  Simple<DeviceType::OPENCL>({1, 1, 2, 5},
-                             {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
-                             {2, 10},
-                             {1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                              10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
-                             {2}, {2, 3},
-                             {1, 1, 1, 2}, {387, 3853});
-  Simple<DeviceType::OPENCL>({1, 1, 2, 3},
-                             {1, 2, 3, 4, 5, 6},
-                             {5, 6},
-                             {1, 2, 3, 4, 5, 6,
-                              10, 20, 30, 40, 50, 60,
-                              1, 2, 3, 4, 5, 6,
-                              10, 20, 30, 40, 50, 60,
-                              1, 2, 3, 4, 5, 6},
-                             {5}, {1, 2, 3, 4, 5},
-                             {1, 1, 1, 5}, {92, 912, 94, 914, 96});
+  Simple<DeviceType::OPENCL>({1, 2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 8},
+                             {1, 2, 3, 4, 5, 6, 7, 8}, {1}, {2}, {1, 1, 1, 1},
+                             {206});
+  Simple<DeviceType::OPENCL>(
+      {1, 1, 2, 5}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, {2, 10},
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100},
+      {2}, {2, 3}, {1, 1, 1, 2}, {387, 3853});
+  Simple<DeviceType::OPENCL>(
+      {1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}, {5, 6},
+      {1, 2, 3, 4,  5,  6,  10, 20, 30, 40, 50, 60, 1, 2, 3,
+       4, 5, 6, 10, 20, 30, 40, 50, 60, 1,  2,  3,  4, 5, 6},
+      {5}, {1, 2, 3, 4, 5}, {1, 1, 1, 5}, {92, 912, 94, 914, 96});
 }
 
 TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
-  Simple<DeviceType::OPENCL>({2, 1, 2, 2},
-                             {1, 2, 3, 4, 5, 6, 7, 8},
-                             {1, 4},
-                             {1, 2, 3, 4},
-                             {1}, {2},
-                             {2, 1, 1, 1}, {32, 72});
+  Simple<DeviceType::OPENCL>({2, 1, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 4},
+                             {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
 }
 
-template<typename T>
+template <typename T>
 void Complex(const index_t batch,
              const index_t height,
              const index_t width,
@@ -156,8 +125,7 @@ void Complex(const index_t batch,
       "Input", {batch, height, width, channels});
   net.AddRandomInput<DeviceType::OPENCL, float>(
       "Weight", {out_channel, height * width * channels});
-  net.AddRandomInput<DeviceType::OPENCL, float>(
-      "Bias", {out_channel});
+  net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
 
   // run cpu
   net.RunOp();
@@ -215,6 +183,4 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
   Complex<half>(16, 13, 12, 31, 113);
   Complex<half>(31, 21, 11, 23, 103);
 }
-
 }
-
diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc
index 37a056f1..ad64be0d 100644
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -511,8 +511,9 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
   TestGeneralConvNxNS12<DeviceType::OPENCL, float>({40, 40}, {15, 1, 32, 64});
 }
 
-template<DeviceType D, typename T>
-static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilation) {
+template <DeviceType D, typename T>
+static void TestAtrousConvNxN(const std::vector<index_t> &shape,
+                              const int dilation) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -550,9 +551,12 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
     expected.Copy(*net.GetOutput("Output"));
 
     // run on gpu
-    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage",
+                        kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
         .Input("InputImage")
@@ -567,7 +571,8 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
     // Run on device
     net.RunOp(D);
 
-    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
+                        kernels::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };
 
@@ -591,7 +596,7 @@ TEST_F(FusedConv2dOpTest, OPENCLUnalignedAtrousConvNxN) {
   TestAtrousConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, 2);
 }
 
-template<DeviceType D>
+template <DeviceType D>
 static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
                                       const std::vector<index_t> &filter_shape,
                                       const std::vector<int> &dilations) {
@@ -620,7 +625,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
         .Finalize(net.NewOperatorDef());
 
     // Add input data
-    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, height, width, input_channels});
     net.AddRandomInput<D, float>(
         "Filter", {kernel_h, kernel_w, output_channels, input_channels});
     net.AddRandomInput<D, float>("Bias", {output_channels});
@@ -632,9 +638,12 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
     expected.Copy(*net.GetOutput("Output"));
 
     // run on gpu
-    BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
-    BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
-    BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, half>(net, "Input", "InputImage",
+                           kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, half>(net, "Filter", "FilterImage",
+                           kernels::BufferType::CONV2D_FILTER);
+    BufferToImage<D, half>(net, "Bias", "BiasImage",
+                           kernels::BufferType::ARGUMENT);
 
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
         .Input("InputImage")
@@ -649,7 +658,8 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
     // Run on device
     net.RunOp(D);
 
-    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT_CHANNEL);
+    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
+                            kernels::BufferType::IN_OUT_CHANNEL);
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
   };
 
@@ -658,13 +668,11 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
 }
 
 TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
-  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32},
-                                                {7, 7, 3, 16},
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
                                                 {2, 2});
 }
 
 TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
-  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71},
-                                                {15, 15, 16, 16},
+  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
                                                 {2, 2});
 }
diff --git a/mace/ops/global_avg_pooling.h b/mace/ops/global_avg_pooling.h
index 55deb2a9..dc1cda9e 100644
--- a/mace/ops/global_avg_pooling.h
+++ b/mace/ops/global_avg_pooling.h
@@ -1,4 +1,4 @@
-//DMACE_ENABLE_NEON
+// DMACE_ENABLE_NEON
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
diff --git a/mace/ops/global_avg_pooling_benchmark.cc b/mace/ops/global_avg_pooling_benchmark.cc
index 00b5471a..1ea07c4b 100644
--- a/mace/ops/global_avg_pooling_benchmark.cc
+++ b/mace/ops/global_avg_pooling_benchmark.cc
@@ -40,13 +40,13 @@ static void GlobalAvgPooling(
   static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
       int iters) {                                                    \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
-    mace::testing::MaccProcessed(tot);                               \
+    mace::testing::MaccProcessed(tot);                                \
     mace::testing::BytesProcessed(tot *(sizeof(float)));              \
     GlobalAvgPooling<DEVICE>(iters, N, C, H, W);                      \
   }                                                                   \
   BENCHMARK(BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE)
 
-#define BM_GLOBAL_AVG_POOLING(N, C, H, W)       \
+#define BM_GLOBAL_AVG_POOLING(N, C, H, W) \
   BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, CPU);
 //  BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, NEON);
 
diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h
index ab4cc5ed..22169b4e 100644
--- a/mace/ops/image_to_buffer.h
+++ b/mace/ops/image_to_buffer.h
@@ -11,17 +11,18 @@
 namespace mace {
 
 template <DeviceType D, typename T>
-class ImageToBufferOp: public Operator<D, T> {
+class ImageToBufferOp : public Operator<D, T> {
  public:
   ImageToBufferOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws), functor_(true)  {}
+      : Operator<D, T>(op_def, ws), functor_(true) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
     Tensor *output = this->Output(OUTPUT);
 
-    kernels::BufferType type = static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
-        "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
+    kernels::BufferType type =
+        static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
+            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     functor_(output, type, const_cast<Tensor *>(input_tensor), future);
     return true;
   }
diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h
index 6cfdfe99..b45ae35a 100644
--- a/mace/ops/matmul.h
+++ b/mace/ops/matmul.h
@@ -24,8 +24,8 @@ class MatMulOp : public Operator<D, T> {
         << "The dimension of A and B should be 4";
     MACE_CHECK(A->dim(0) == B->dim(0)) << "A and B must have same batch size";
     MACE_CHECK(A->dim(2) == B->dim(1))
-      << "the number of A's column " << A->dim(2)
-      << " must be equal to B's row " << B->dim(1);
+        << "the number of A's column " << A->dim(2)
+        << " must be equal to B's row " << B->dim(1);
 
     functor_(A, B, C, future);
     return true;
diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc
index b6c801df..d8b80ead 100644
--- a/mace/ops/matmul_test.cc
+++ b/mace/ops/matmul_test.cc
@@ -10,7 +10,7 @@ namespace mace {
 
 class MatMulOpTest : public OpsTestBase {};
 
-template<DeviceType D>
+template <DeviceType D>
 void Simple(const std::vector<index_t> &A_shape,
             const std::vector<float> &A_value,
             const std::vector<index_t> &B_shape,
@@ -51,29 +51,24 @@ void Simple(const std::vector<index_t> &A_shape,
   }
 
   // Check
-  auto expected =
-      CreateTensor<float>(C_shape, C_value);
+  auto expected = CreateTensor<float>(C_shape, C_value);
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
 
 TEST_F(MatMulOpTest, SimpleCPU) {
-  Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6},
-                          {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6},
-                          {1, 2, 2, 1}, {22, 28, 49, 64});
-  Simple<DeviceType::CPU>({1, 5, 5, 1},
-                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                           16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
-                          {1, 5, 5, 1},
-                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                           16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
-                          {1, 5, 5, 1},
-                          {215, 230, 245, 260, 275, 490, 530, 570, 610, 650,
-                           765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400,
-                           1315, 1430, 1545, 1660, 1775});
+  Simple<DeviceType::CPU>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
+                          {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1}, {22, 28, 49, 64});
+  Simple<DeviceType::CPU>(
+      {1, 5, 5, 1}, {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
+      {1, 5, 5, 1}, {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
+      {1, 5, 5, 1}, {215,  230,  245,  260,  275,  490,  530,  570,  610,
+                     650,  765,  830,  895,  960,  1025, 1040, 1130, 1220,
+                     1310, 1400, 1315, 1430, 1545, 1660, 1775});
 }
 
-
 TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
   Simple<DeviceType::CPU>({2, 2, 3, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
                           {2, 3, 2, 1}, {1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6},
@@ -81,19 +76,17 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
 }
 
 TEST_F(MatMulOpTest, SimpleOPENCL) {
-  Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6},
-                             {1, 3, 2, 1}, {1, 2, 3, 4, 5, 6},
-                             {1, 2, 2, 1}, {22, 28, 49, 64});
-  Simple<DeviceType::OPENCL>({1, 5, 5, 1},
-                             {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
-                             {1, 5, 5, 1},
-                             {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                              16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
-                             {1, 5, 5, 1},
-                             {215, 230, 245, 260, 275, 490, 530, 570, 610, 650,
-                              765, 830, 895, 960, 1025, 1040, 1130, 1220, 1310, 1400,
-                              1315, 1430, 1545, 1660, 1775});
+  Simple<DeviceType::OPENCL>({1, 2, 3, 1}, {1, 2, 3, 4, 5, 6}, {1, 3, 2, 1},
+                             {1, 2, 3, 4, 5, 6}, {1, 2, 2, 1},
+                             {22, 28, 49, 64});
+  Simple<DeviceType::OPENCL>(
+      {1, 5, 5, 1}, {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
+      {1, 5, 5, 1}, {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
+                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25},
+      {1, 5, 5, 1}, {215,  230,  245,  260,  275,  490,  530,  570,  610,
+                     650,  765,  830,  895,  960,  1025, 1040, 1130, 1220,
+                     1310, 1400, 1315, 1430, 1545, 1660, 1775});
 }
 
 TEST_F(MatMulOpTest, SimpleGPUWithBatch) {
@@ -118,8 +111,8 @@ void Complex(const index_t batch,
       .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddRandomInput<DeviceType::OPENCL, float>(
-      "A", {batch, height, channels, 1});
+  net.AddRandomInput<DeviceType::OPENCL, float>("A",
+                                                {batch, height, channels, 1});
   net.AddRandomInput<DeviceType::OPENCL, float>(
       "B", {batch, channels, out_width, 1});
 
@@ -132,9 +125,9 @@ void Complex(const index_t batch,
 
   // Run on opencl
   BufferToImage<DeviceType::OPENCL, T>(net, "A", "AImage",
-                                           kernels::BufferType::IN_OUT_WIDTH);
+                                       kernels::BufferType::IN_OUT_WIDTH);
   BufferToImage<DeviceType::OPENCL, T>(net, "B", "BImage",
-                                           kernels::BufferType::IN_OUT_HEIGHT);
+                                       kernels::BufferType::IN_OUT_HEIGHT);
 
   OpDefBuilder("MatMul", "MatMulTest")
       .Input("AImage")
@@ -177,5 +170,4 @@ TEST_F(MatMulOpTest, OPENCLHalfUnAlignedWithBatch) {
   Complex<half>(16, 32, 64, 64);
   Complex<half>(31, 31, 61, 67);
 }
-
 }
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 15aa3bc9..50c2f2ca 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -95,7 +95,7 @@ class OpDefBuilder {
 
 class OpsTestNet {
  public:
-  OpsTestNet() : op_registry_(new OperatorRegistry()) {};
+  OpsTestNet() : op_registry_(new OperatorRegistry()){};
 
   template <DeviceType D, typename T>
   void AddInputFromArray(const std::string &name,
@@ -239,7 +239,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
 
 template <typename T>
 std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
-                                const std::vector<T> &data) {
+                                     const std::vector<T> &data) {
   std::unique_ptr<Tensor> res(
       new Tensor(GetDeviceAllocator(DeviceType::CPU), DataTypeToEnum<T>::v()));
   res->Resize(shape);
@@ -334,9 +334,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
         for (int h = 0; h < x.dim(1); ++h) {
           for (int w = 0; w < x.dim(2); ++w) {
             for (int c = 0; c < x.dim(3); ++c) {
-              EXPECT_NEAR(*a, *b, abs_err) << "with index = ["
-                                           << n << ", " << h << ", "
-                                           << w << ", " << c << "]";
+              EXPECT_NEAR(*a, *b, abs_err) << "with index = [" << n << ", " << h
+                                           << ", " << w << ", " << c << "]";
               a++;
               b++;
             }
diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h
index 2e4aed62..b88093ab 100644
--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -20,8 +20,12 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
         pooling_type_(
             static_cast<PoolingType>(OperatorBase::GetSingleArgument<int>(
                 "pooling_type", static_cast<int>(AVG)))),
-        functor_(pooling_type_, kernels_.data(), this->strides_.data(),
-                 this->padding_type_, this->paddings_, this->dilations_.data()){};
+        functor_(pooling_type_,
+                 kernels_.data(),
+                 this->strides_.data(),
+                 this->padding_type_,
+                 this->paddings_,
+                 this->dilations_.data()){};
 
   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc
index fd673d42..dae7e1af 100644
--- a/mace/ops/pooling_benchmark.cc
+++ b/mace/ops/pooling_benchmark.cc
@@ -54,7 +54,7 @@ static void Pooling(int iters,
       BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE( \
           int iters) {                                                              \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                \
-    mace::testing::MaccProcessed(tot);                                             \
+    mace::testing::MaccProcessed(tot);                                              \
     mace::testing::BytesProcessed(tot *(sizeof(float)));                            \
     Pooling<DEVICE>(iters, N, C, H, W, KE, STRIDE, Padding::PA,                     \
                     PoolingType::PO);                                               \
@@ -62,7 +62,7 @@ static void Pooling(int iters,
   BENCHMARK(                                                                        \
       BM_POOLING_##N##_##C##_##H##_##W##_K##KE##S##STRIDE##_##PA##_##PO##_##DEVICE)
 
-#define BM_POOLING(N, C, H, W, K, S, PA, PO)       \
+#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
   BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU);
 //  BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, NEON);
 
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index bf4cff8b..8bababc8 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -198,7 +198,8 @@ static void MaxPooling3S2(const std::vector<index_t> &input_shape,
   Tensor expected;
   expected.Copy(*net.GetOutput("Output"));
 
-  BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<D, T>(net, "Input", "InputImage",
+                      kernels::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Pooling", "PoolingTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -333,7 +334,8 @@ static void AvgPoolingTest(const std::vector<index_t> &shape,
   Tensor expected;
   expected.Copy(*net.GetOutput("Output"));
 
-  BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT_CHANNEL);
+  BufferToImage<D, T>(net, "Input", "InputImage",
+                      kernels::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("Pooling", "PoolingTest")
       .Input("InputImage")
       .Output("OutputImage")
diff --git a/mace/ops/reshape.h b/mace/ops/reshape.h
index 46c8c875..a4aec715 100644
--- a/mace/ops/reshape.h
+++ b/mace/ops/reshape.h
@@ -15,7 +15,7 @@ class ReshapeOp : public Operator<D, T> {
  public:
   ReshapeOp(const OperatorDef &op_def, Workspace *ws)
       : Operator<D, T>(op_def, ws),
-        shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")){}
+        shape_(OperatorBase::GetRepeatedArgument<int64_t>("shape")) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
@@ -38,9 +38,11 @@ class ReshapeOp : public Operator<D, T> {
     }
 
     if (unknown_idx != -1) {
-      MACE_CHECK(product != 0) << "Cannot infer shape if there is zero shape size.";
+      MACE_CHECK(product != 0)
+          << "Cannot infer shape if there is zero shape size.";
       const index_t missing = input->size() / product;
-      MACE_CHECK(missing * product == input->size()) << "Input size not match reshaped tensor size";
+      MACE_CHECK(missing * product == input->size())
+          << "Input size not match reshaped tensor size";
       out_shape[unknown_idx] = missing;
     }
 
diff --git a/mace/ops/reshape_test.cc b/mace/ops/reshape_test.cc
index ab3c13a0..851f33cc 100644
--- a/mace/ops/reshape_test.cc
+++ b/mace/ops/reshape_test.cc
@@ -13,7 +13,6 @@ class ReshapeTest : public OpsTestBase {};
 void TestReshape(const std::vector<index_t> &org_shape,
                  const std::vector<int> &output_shape,
                  const std::vector<index_t> &res_shape) {
-
   // Construct graph
   OpsTestNet net;
   OpDefBuilder("Reshape", "ReshapeTest")
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
index 8742f020..7b68e762 100644
--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -20,9 +20,9 @@ void Register_Softmax(OperatorRegistry *op_registry) {
                     SoftmaxOp<DeviceType::OPENCL, float>);
 
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
-                                    .Device(DeviceType::OPENCL)
-                                    .TypeConstraint<half>("T")
-                                    .Build(),
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<half>("T")
+                                     .Build(),
                     SoftmaxOp<DeviceType::OPENCL, half>);
 }
 
diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h
index cbce1d75..3eebabe0 100644
--- a/mace/ops/softmax.h
+++ b/mace/ops/softmax.h
@@ -14,11 +14,10 @@ template <DeviceType D, class T>
 class SoftmaxOp : public Operator<D, T> {
  public:
   SoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<D, T>(operator_def, ws) {
-  }
+      : Operator<D, T>(operator_def, ws) {}
 
   bool Run(StatsFuture *future) override {
-    const Tensor *logits= this->Input(LOGITS);
+    const Tensor *logits = this->Input(LOGITS);
 
     Tensor *output = this->Output(OUTPUT);
     output->ResizeLike(logits);
diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc
index af8e3afc..68c4e4e6 100644
--- a/mace/ops/softmax_test.cc
+++ b/mace/ops/softmax_test.cc
@@ -14,7 +14,8 @@ void Simple() {
   // Construct graph
   OpsTestNet net;
   // Add input data
-  net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4}, {1, 1, 1, 1, 1, 2, 3, 4});
+  net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
+                                  {1, 1, 1, 1, 1, 2, 3, 4});
 
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
@@ -41,18 +42,15 @@ void Simple() {
     net.RunOp(D);
   }
 
-  auto expected = CreateTensor<float>({1, 1, 2, 4}, {0.25, 0.25, 0.25, 0.25,
-                                                     0.0320586, 0.08714432, 0.23688282, 0.64391426});
+  auto expected = CreateTensor<float>(
+      {1, 1, 2, 4},
+      {0.25, 0.25, 0.25, 0.25, 0.0320586, 0.08714432, 0.23688282, 0.64391426});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-7);
 }
 
-TEST_F(SoftmaxOpTest, CPUSimple) {
-  Simple<DeviceType::CPU>();
-}
-TEST_F(SoftmaxOpTest, OPENCLSimple) {
-  Simple<DeviceType::OPENCL>();
-}
+TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
+TEST_F(SoftmaxOpTest, OPENCLSimple) { Simple<DeviceType::OPENCL>(); }
 
 template <DeviceType D>
 void Complex(const std::vector<index_t> &logits_shape) {
diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h
index 787b82e6..b25c5895 100644
--- a/mace/ops/space_to_batch.h
+++ b/mace/ops/space_to_batch.h
@@ -12,7 +12,7 @@
 
 namespace mace {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class SpaceToBatchNDOp : public Operator<D, T> {
  public:
   SpaceToBatchNDOp(const OperatorDef &op_def, Workspace *ws)
@@ -28,17 +28,19 @@ class SpaceToBatchNDOp : public Operator<D, T> {
 
     std::vector<index_t> output_shape(4, 0);
     CalculateOutputShape(space_tensor, batch_tensor, output_shape.data());
-    functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor, future);
+    functor_(const_cast<Tensor *>(space_tensor), output_shape, batch_tensor,
+             future);
     return true;
   }
 
  private:
-
   inline void CalculateOutputShape(const Tensor *input_tensor,
                                    Tensor *output,
                                    index_t *output_shape) {
-    auto paddings = OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
-    auto block_shape = OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
+    auto paddings =
+        OperatorBase::GetRepeatedArgument<int>("paddings", {0, 0, 0, 0});
+    auto block_shape =
+        OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
     MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
     MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
     MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D");
@@ -46,13 +48,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
     const index_t block_dims = block_shape.size();
     index_t block_shape_product = 1;
     for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
-      MACE_CHECK(block_shape[block_dim] > 1, "block_shape's value should be great to 1");
+      MACE_CHECK(block_shape[block_dim] > 1,
+                 "block_shape's value should be great to 1");
       const index_t block_shape_value = block_shape[block_dim];
-      const index_t padded_input_size = input_tensor->dim(block_dim + 1)
-          + paddings[block_dim * 2]
-          + paddings[block_dim * 2 + 1];
-      MACE_CHECK(padded_input_size % block_shape_value == 0,
-                 "padded input ", padded_input_size, " is not divisible by block_shape");
+      const index_t padded_input_size = input_tensor->dim(block_dim + 1) +
+                                        paddings[block_dim * 2] +
+                                        paddings[block_dim * 2 + 1];
+      MACE_CHECK(padded_input_size % block_shape_value == 0, "padded input ",
+                 padded_input_size, " is not divisible by block_shape");
       block_shape_product *= block_shape_value;
       output_shape[block_dim + 1] = padded_input_size / block_shape_value;
     }
diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc
index ac643f94..db72ce54 100644
--- a/mace/ops/space_to_batch_benchmark.cc
+++ b/mace/ops/space_to_batch_benchmark.cc
@@ -42,7 +42,7 @@ static void BMSpaceToBatch(
       BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE( \
           int iters) {                                                       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
-    mace::testing::MaccProcessed(tot);                                      \
+    mace::testing::MaccProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
     BMSpaceToBatch<DEVICE, TYPE>(iters, N, H, W, C, SHAPE);                  \
   }                                                                          \
diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc
index c76757f9..9fc5e40b 100644
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -4,8 +4,8 @@
 
 #include <fstream>
 #include "mace/core/operator.h"
-#include "mace/ops/ops_test_util.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/ops/ops_test_util.h"
 
 namespace mace {
 
@@ -21,7 +21,9 @@ void TransposeFilter(const std::vector<float> &input,
     for (index_t w = 0; w < input_shape[1]; ++w) {
       for (index_t oc = 0; oc < input_shape[2]; ++oc) {
         for (index_t ic = 0; ic < input_shape[3]; ++ic) {
-          int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) * input_shape[1] + w;
+          int offset = ((oc * input_shape[3] + ic) * input_shape[0] + h) *
+                           input_shape[1] +
+                       w;
           output[offset] = *input_ptr;
           ++input_ptr;
         }
@@ -30,7 +32,7 @@ void TransposeFilter(const std::vector<float> &input,
   }
 }
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 void WinogradConvolution(const index_t batch,
                          const index_t height,
                          const index_t width,
@@ -53,8 +55,7 @@ void WinogradConvolution(const index_t batch,
                       kernels::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(net, "Filter", "FilterImage",
                       kernels::BufferType::CONV2D_FILTER);
-  BufferToImage<D, T>(net, "Bias", "BiasImage",
-                      kernels::BufferType::ARGUMENT);
+  BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputImage")
       .Input("FilterImage")
@@ -78,8 +79,10 @@ void WinogradConvolution(const index_t batch,
   // transform filter
   std::vector<float> wino_filter_data;
   TransposeFilter(filter_data, filter_shape, wino_filter_data);
-  net.AddInputFromArray<D, float>("WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
-  BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter", kernels::BufferType::WINOGRAD_FILTER);
+  net.AddInputFromArray<D, float>(
+      "WinoFilterData", {out_channels, in_channels, 3, 3}, wino_filter_data);
+  BufferToImage<D, T>(net, "WinoFilterData", "WinoFilter",
+                      kernels::BufferType::WINOGRAD_FILTER);
 
   // transform input
   OpDefBuilder("WinogradTransform", "WinogradTransformTest")
@@ -126,18 +129,23 @@ void WinogradConvolution(const index_t batch,
 }
 
 TEST_F(WinogradConvlutionTest, AlignedConvolution) {
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::VALID);
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::SAME);
+  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
+                                                 Padding::VALID);
+  WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16,
+                                                 Padding::SAME);
 }
 
 TEST_F(WinogradConvlutionTest, UnAlignedConvolution) {
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37, Padding::VALID);
-  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31, Padding::SAME);
+  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 31, 37,
+                                                 Padding::VALID);
+  WinogradConvolution<DeviceType::OPENCL, float>(1, 61, 67, 37, 31,
+                                                 Padding::SAME);
 }
 
 TEST_F(WinogradConvlutionTest, BatchConvolution) {
-  WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32, Padding::VALID);
-  WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31, Padding::SAME);
+  WinogradConvolution<DeviceType::OPENCL, float>(3, 64, 64, 32, 32,
+                                                 Padding::VALID);
+  WinogradConvolution<DeviceType::OPENCL, float>(5, 61, 67, 37, 31,
+                                                 Padding::SAME);
 }
-
 }
diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h
index aef37473..4ea49289 100644
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -8,12 +8,12 @@
 #include <memory>
 
 #include "mace/core/operator.h"
-#include "mace/kernels/winograd_transform.h"
 #include "mace/kernels/activation.h"
+#include "mace/kernels/winograd_transform.h"
 
 namespace mace {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class WinogradInverseTransformOp : public Operator<D, T> {
  public:
   WinogradInverseTransformOp(const OperatorDef &op_def, Workspace *ws)
diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h
index 71d8a527..e225adc7 100644
--- a/mace/ops/winograd_transform.h
+++ b/mace/ops/winograd_transform.h
@@ -12,14 +12,14 @@
 
 namespace mace {
 
-template<DeviceType D, typename T>
+template <DeviceType D, typename T>
 class WinogradTransformOp : public Operator<D, T> {
  public:
   WinogradTransformOp(const OperatorDef &op_def, Workspace *ws)
       : Operator<D, T>(op_def, ws),
         functor_(static_cast<Padding>(OperatorBase::GetSingleArgument<int>(
-            "padding", static_cast<int>(VALID))),
-        OperatorBase::GetRepeatedArgument<int>("padding_values")) {}
+                     "padding", static_cast<int>(VALID))),
+                 OperatorBase::GetRepeatedArgument<int>("padding_values")) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc
index a8c0e77b..23f7249b 100644
--- a/mace/ops/winograd_transform_benchmark.cc
+++ b/mace/ops/winograd_transform_benchmark.cc
@@ -16,7 +16,7 @@ static void BMWinogradTransform(
   net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
 
   BufferToImage<D, T>(net, "Input", "InputImage",
-                          kernels::BufferType::IN_OUT_CHANNEL);
+                      kernels::BufferType::IN_OUT_CHANNEL);
   OpDefBuilder("WinogradTransform", "WinogradTransformTest")
       .Input("InputImage")
       .Output("OutputImage")
@@ -36,17 +36,15 @@ static void BMWinogradTransform(
   net.Sync();
 }
 
-#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)             \
-  static void                                                         \
-      BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(    \
-          int iters) {                                                \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
-    mace::testing::MaccProcessed(tot);                               \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));               \
-    BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                  \
-  }                                                                   \
-  BENCHMARK(                                                          \
-      BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
+#define BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)                  \
+  static void BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+      int iters) {                                                             \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
+    mace::testing::MaccProcessed(tot);                                         \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
+    BMWinogradTransform<DEVICE, TYPE>(iters, N, H, W, C);                      \
+  }                                                                            \
+  BENCHMARK(BM_WINOGRAD_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
 #define BM_WINOGRAD_TRANSFORM(N, H, W, C) \
   BM_WINOGRAD_TRANSFORM_MACRO(N, H, W, C, half, OPENCL);
@@ -88,16 +86,16 @@ static void BMWinogradInverseTransform(
   net.Sync();
 }
 
-#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)             \
-  static void                                                         \
-      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE(    \
-          int iters) {                                                \
-    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;  \
-    mace::testing::MaccProcessed(tot);                               \
-    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));               \
-    BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);                  \
-  }                                                                   \
-  BENCHMARK(                                                          \
+#define BM_WINOGRAD_INVERSE_TRANSFORM_MACRO(N, H, W, C, TYPE, DEVICE)          \
+  static void                                                                  \
+      BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \
+          int iters) {                                                         \
+    const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;           \
+    mace::testing::MaccProcessed(tot);                                         \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                        \
+    BMWinogradInverseTransform<DEVICE, TYPE>(iters, N, H, W, C);               \
+  }                                                                            \
+  BENCHMARK(                                                                   \
       BM_WINOGRAD_INVERSE_TRANSFORM_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE)
 
 #define BM_WINOGRAD_INVERSE_TRANSFORM(N, H, W, C) \
diff --git a/mace/public/mace.h b/mace/public/mace.h
index d5fd7a52..5d4ad299 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -6,10 +6,10 @@
 #define MACE_CORE_MACE_H_
 
 #include <cstdint>
-#include <vector>
-#include <string>
-#include <memory>
 #include <map>
+#include <memory>
+#include <string>
+#include <vector>
 
 namespace mace {
 
@@ -25,13 +25,11 @@ namespace mace {
 #define MACE_STR(x) MACE_STR_HELPER(x)
 
 // e.g. "0.5.0" or "0.6.0-alpha".
-#define MACE_VERSION_STRING                                            \
+#define MACE_VERSION_STRING                                                    \
   (MACE_STR(MACE_MAJOR_VERSION) "." MACE_STR(MACE_MINOR_VERSION) "." MACE_STR( \
       MACE_PATCH_VERSION) MACE_VERSION_SUFFIX)
 
-inline const char *MaceVersion() {
-  return MACE_VERSION_STRING;
-}
+inline const char *MaceVersion() { return MACE_VERSION_STRING; }
 
 extern const char *MaceGitVersion();
 
@@ -43,17 +41,9 @@ extern const char *MaceGitVersion();
   classname &operator=(const classname &) = delete
 #endif
 
-enum NetMode {
-  INIT = 0,
-  NORMAL = 1
-};
+enum NetMode { INIT = 0, NORMAL = 1 };
 
-enum DeviceType {
-  CPU = 0,
-  NEON = 1,
-  OPENCL = 2,
-  HEXAGON = 3
-};
+enum DeviceType { CPU = 0, NEON = 1, OPENCL = 2, HEXAGON = 3 };
 
 enum DataType {
   DT_INVALID = 0,
@@ -104,6 +94,7 @@ class Argument {
  public:
   Argument();
   void CopyFrom(const Argument &from);
+
  public:
   const std::string &name() const;
   void set_name(const std::string &value);
@@ -147,11 +138,13 @@ class NodeInput {
   NodeInput() {}
   NodeInput(int node_id, int output_port);
   void CopyFrom(const NodeInput &from);
+
  public:
   int node_id() const;
   void set_node_id(int node_id);
   int output_port() const;
   void set_output_port(int output_port);
+
  private:
   int node_id_;
   int output_port_;
@@ -162,8 +155,10 @@ class OutputShape {
   OutputShape();
   OutputShape(const std::vector<int64_t> &dims);
   void CopyFrom(const OutputShape &from);
+
  public:
   const std::vector<int64_t> &dims() const;
+
  private:
   std::vector<int64_t> dims_;
 };
@@ -240,10 +235,12 @@ class OperatorDef {
 class MemoryBlock {
  public:
   MemoryBlock(int mem_id, uint32_t x, uint32_t y);
+
  public:
   int mem_id() const;
   uint32_t x() const;
   uint32_t y() const;
+
  private:
   int mem_id_;
   uint32_t x_;
@@ -255,9 +252,9 @@ class MemoryArena {
   const std::vector<MemoryBlock> &mem_block() const;
   std::vector<MemoryBlock> &mutable_mem_block();
   int mem_block_size() const;
+
  private:
   std::vector<MemoryBlock> mem_block_;
-
 };
 
 // for hexagon mace-nnlib
@@ -268,10 +265,11 @@ class InputInfo {
   int32_t max_byte_size() const;
   DataType data_type() const;
   const std::vector<int32_t> &dims() const;
+
  private:
   std::string name_;
   int32_t node_id_;
-  int32_t max_byte_size_; // only support 32-bit len
+  int32_t max_byte_size_;  // only support 32-bit len
   DataType data_type_;
   std::vector<int32_t> dims_;
 };
@@ -285,10 +283,11 @@ class OutputInfo {
   void set_data_type(DataType data_type);
   const std::vector<int32_t> &dims() const;
   void set_dims(const std::vector<int32_t> &dims);
+
  private:
   std::string name_;
   int32_t node_id_;
-  int32_t max_byte_size_; // only support 32-bit len
+  int32_t max_byte_size_;  // only support 32-bit len
   DataType data_type_;
   std::vector<int32_t> dims_;
 };
@@ -299,6 +298,7 @@ class NetDef {
   int op_size() const;
 
   const OperatorDef &op(const int idx) const;
+
  public:
   const std::string &name() const;
   bool has_name() const;
@@ -359,7 +359,6 @@ struct RunMetadata {
   std::vector<OperatorStats> op_stats;
 };
 
-
 class Workspace;
 class NetBase;
 class OperatorRegistry;
@@ -374,8 +373,7 @@ struct MaceInputInfo {
 class MaceEngine {
  public:
   // Single input and output
-  explicit MaceEngine(const NetDef *net_def,
-                      DeviceType device_type);
+  explicit MaceEngine(const NetDef *net_def, DeviceType device_type);
   // Multiple input or output
   explicit MaceEngine(const NetDef *net_def,
                       DeviceType device_type,
@@ -394,7 +392,7 @@ class MaceEngine {
   // Multiple input or output
   bool Run(const std::vector<MaceInputInfo> &input,
            std::map<std::string, float *> &output,
-           RunMetadata *run_metadata=nullptr);
+           RunMetadata *run_metadata = nullptr);
   MaceEngine(const MaceEngine &) = delete;
   MaceEngine &operator=(const MaceEngine &) = delete;
 
diff --git a/mace/utils/command_line_flags.h b/mace/utils/command_line_flags.h
index ce65e944..4373ceed 100644
--- a/mace/utils/command_line_flags.h
+++ b/mace/utils/command_line_flags.h
@@ -45,7 +45,7 @@ class Flags {
   // Return a usage message with command line cmdline, and the
   // usage_text strings in flag_list[].
   static std::string Usage(const std::string &cmdline,
-                      const std::vector<Flag> &flag_list);
+                           const std::vector<Flag> &flag_list);
 };
 
 }  // namespace mace
diff --git a/mace/utils/env_time.h b/mace/utils/env_time.h
index 9f42486e..ce70a244 100644
--- a/mace/utils/env_time.h
+++ b/mace/utils/env_time.h
@@ -9,7 +9,6 @@
 #include <sys/time.h>
 #include <time.h>
 
-
 namespace mace {
 
 inline int64_t NowMicros() {
diff --git a/mace/utils/logging.h b/mace/utils/logging.h
index e743e18e..22e39488 100644
--- a/mace/utils/logging.h
+++ b/mace/utils/logging.h
@@ -10,8 +10,8 @@
 #include <string>
 #include <vector>
 
-#include "mace/utils/env_time.h"
 #include "mace/public/mace.h"
+#include "mace/utils/env_time.h"
 #include "mace/utils/string_util.h"
 
 #undef ERROR
diff --git a/mace/utils/string_util.h b/mace/utils/string_util.h
index ac7ab4e0..aad884d3 100644
--- a/mace/utils/string_util.h
+++ b/mace/utils/string_util.h
@@ -27,7 +27,7 @@ inline void MakeStringInternal(std::stringstream &ss,
   MakeStringInternal(ss, args...);
 }
 
-} // namespace
+}  // namespace
 
 template <typename... Args>
 std::string MakeString(const Args &... args) {
diff --git a/mace/utils/timer.h b/mace/utils/timer.h
index ab48fb89..d6e28608 100644
--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -24,13 +24,9 @@ class WallClockTimer : public Timer {
  public:
   WallClockTimer() : accumulated_micros_(0) {}
 
-  void StartTiming() override {
-    start_micros_ = NowMicros();
-  }
+  void StartTiming() override { start_micros_ = NowMicros(); }
 
-  void StopTiming() override {
-    stop_micros_ = NowMicros();
-  }
+  void StopTiming() override { stop_micros_ = NowMicros(); }
 
   void AccumulateTiming() override {
     StopTiming();
@@ -43,13 +39,9 @@ class WallClockTimer : public Timer {
     accumulated_micros_ = 0;
   }
 
-  double ElapsedMicros() override {
-    return stop_micros_ - start_micros_;
-  }
+  double ElapsedMicros() override { return stop_micros_ - start_micros_; }
 
-  double AccumulatedMicros() override {
-    return accumulated_micros_;
-  }
+  double AccumulatedMicros() override { return accumulated_micros_; }
 
  private:
   double start_micros_;
diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc
index 6cd54c9f..80cc50d0 100644
--- a/mace/utils/tuner_test.cc
+++ b/mace/utils/tuner_test.cc
@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
 
   WallClockTimer timer;
   std::vector<unsigned int> default_params(1, 1);
-  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                         default_params,
-                                                                         nullptr,
-                                                                         TunerFunc,
-                                                                         &timer);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, nullptr, TunerFunc, &timer);
 
   EXPECT_EQ(expect, res);
 
   default_params[0] = 2;
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                     default_params,
-                                                                     nullptr,
-                                                                     TunerFunc,
-                                                                     &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect + 1, res);
 }
 
@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
   };
   // tune
   WallClockTimer timer;
-  int res =
-      Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                   default_params,
-                                                                   *params_generator,
-                                                                   TunerFunc,
-                                                                   &timer);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
 
   // run
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                     default_params,
-                                                                     nullptr,
-                                                                     TunerFunc,
-                                                                     &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
 }
 
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 3f5c8ee0..0330de47 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -62,9 +62,9 @@ inline std::string ObfuscateSymbol(const std::string &src) {
   if (dest.empty()) {
     return dest;
   }
-  dest[0] = src[0]; // avoid invalid symbol which starts from 0-9
+  dest[0] = src[0];  // avoid invalid symbol which starts from 0-9
   const std::string encode_dict =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
   for (size_t i = 1; i < src.size(); i++) {
     char ch = src[i];
     int idx;
diff --git a/mace/utils/utils_test.cc b/mace/utils/utils_test.cc
index 6cd54c9f..80cc50d0 100644
--- a/mace/utils/utils_test.cc
+++ b/mace/utils/utils_test.cc
@@ -30,20 +30,14 @@ TEST_F(TunerTest, SimpleRun) {
 
   WallClockTimer timer;
   std::vector<unsigned int> default_params(1, 1);
-  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                         default_params,
-                                                                         nullptr,
-                                                                         TunerFunc,
-                                                                         &timer);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, nullptr, TunerFunc, &timer);
 
   EXPECT_EQ(expect, res);
 
   default_params[0] = 2;
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                     default_params,
-                                                                     nullptr,
-                                                                     TunerFunc,
-                                                                     &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect + 1, res);
 }
 
@@ -64,20 +58,13 @@ TEST_F(TunerTest, SimpleTune) {
   };
   // tune
   WallClockTimer timer;
-  int res =
-      Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                   default_params,
-                                                                   *params_generator,
-                                                                   TunerFunc,
-                                                                   &timer);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, *params_generator, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
 
   // run
-  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
-                                                                     default_params,
-                                                                     nullptr,
-                                                                     TunerFunc,
-                                                                     &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>(
+      "SimpleRun", default_params, nullptr, TunerFunc, &timer);
   EXPECT_EQ(expect, res);
 }
 
-- 
GitLab