Xiaomi / Mace
Commit 4410ecd2
Authored Mar 07, 2018 by Liangliang He
Parent: b26187f0

Reformatting code and enable cpplint

Showing 128 changed files with 1634 additions and 1831 deletions (+1634, -1831)
.gitlab-ci.yml  +9 -0
mace/core/allocator.h  +2 -2
mace/core/arg_helper.cc  +2 -2
mace/core/buffer.h  +55 -88
mace/core/mace.cc  +134 -291
mace/core/net.cc  +7 -7
mace/core/operator.h  +2 -2
mace/core/preallocated_pooled_allocator.h  +2 -2
mace/core/runtime/hexagon/hexagon_control_wrapper.cc  +69 -83
mace/core/runtime/hexagon/hexagon_control_wrapper.h  +8 -10
mace/core/runtime/hexagon/hexagon_controller_dummy.cc  +140 -26
mace/core/runtime/hexagon/hexagon_nn.h  +135 -67
mace/core/runtime/hexagon/hexagon_nn_ops.h  +5 -5
mace/core/runtime/hexagon/ops.h  +0 -1
mace/core/runtime/hexagon/quantize.cc  +5 -5
mace/core/runtime/hexagon/quantize.h  +11 -7
mace/core/runtime/opencl/opencl_allocator.cc  +11 -16
mace/core/runtime/opencl/opencl_development.cc  +3 -2
mace/core/runtime/opencl/opencl_production.cc  +2 -1
mace/core/runtime/opencl/opencl_runtime.cc  +16 -21
mace/core/runtime/opencl/opencl_runtime.h  +3 -1
mace/core/runtime/opencl/opencl_wrapper.h  +4 -4
mace/core/tensor.h  +38 -57
mace/core/testing/test_benchmark.cc  +1 -3
mace/core/testing/test_benchmark.h  +1 -1
mace/core/types.cc  +7 -13
mace/core/workspace.cc  +36 -39
mace/core/workspace.h  +2 -2
mace/kernels/activation.h  +12 -10
mace/kernels/addn.h  +1 -1
mace/kernels/batch_norm.h  +4 -6
mace/kernels/bias_add.h  +1 -3
mace/kernels/buffer_to_image.h  +6 -8
mace/kernels/channel_shuffle.h  +4 -2
mace/kernels/concat.h  +10 -10
mace/kernels/conv_2d.h  +6 -6
mace/kernels/conv_pool_2d_util.cc  +21 -10
mace/kernels/conv_pool_2d_util.h  +1 -1
mace/kernels/depthwise_conv2d.h  +5 -4
mace/kernels/eltwise.h  +11 -14
mace/kernels/fully_connected.h  +8 -10
mace/kernels/global_avg_pooling.h  +4 -2
mace/kernels/matmul.h  +1 -4
mace/kernels/neon/batch_norm_neon.cc  +2 -1
mace/kernels/neon/conv_2d_neon.cc  +4 -5
mace/kernels/neon/conv_2d_neon_3x3.cc  +4 -8
mace/kernels/neon/depthwise_conv_neon.cc  +4 -5
mace/kernels/opencl/activation_opencl.cc  +3 -4
mace/kernels/opencl/addn.cc  +8 -16
mace/kernels/opencl/batch_norm_opencl.cc  +3 -6
mace/kernels/opencl/bias_add_opencl.cc  +8 -13
mace/kernels/opencl/buffer_to_image.cc  +28 -26
mace/kernels/opencl/cl/common.h  +2 -2
mace/kernels/opencl/concat.cc  +24 -29
mace/kernels/opencl/conv_2d_opencl.cc  +19 -18
mace/kernels/opencl/conv_2d_opencl_1x1.cc  +5 -11
mace/kernels/opencl/conv_2d_opencl_3x3.cc  +5 -10
mace/kernels/opencl/conv_2d_opencl_general.cc  +5 -10
mace/kernels/opencl/depthwise_conv_opencl.cc  +11 -12
mace/kernels/opencl/eltwise_opencl.cc  +6 -14
mace/kernels/opencl/fully_connected_opencl.cc  +12 -23
mace/kernels/opencl/helper.cc  +40 -48
mace/kernels/opencl/helper.h  +6 -7
mace/kernels/opencl/matmul.cc  +9 -18
mace/kernels/opencl/pooling_opencl.cc  +9 -18
mace/kernels/opencl/resize_bilinear_opencl.cc  +9 -11
mace/kernels/opencl/softmax_opencl.cc  +6 -11
mace/kernels/opencl/space_to_batch_opencl.cc  +16 -15
mace/kernels/opencl/winograd_transform.cc  +53 -48
mace/kernels/pooling.h  +25 -26
mace/kernels/reshape.h  +1 -2
mace/kernels/resize_bilinear.h  +11 -10
mace/kernels/space_to_batch.h  +11 -11
mace/kernels/winograd_transform.h  +22 -22
mace/ops/activation.h  +2 -1
mace/ops/activation_test.cc  +3 -4
mace/ops/addn.h  +4 -6
mace/ops/addn_benchmark.cc  +1 -2
mace/ops/batch_norm_benchmark.cc  +1 -1
mace/ops/batch_to_space.h  +14 -13
mace/ops/batch_to_space_benchmark.cc  +1 -1
mace/ops/bias_add_benchmark.cc  +1 -1
mace/ops/buffer_to_image.h  +5 -4
mace/ops/buffer_to_image_test.cc  +45 -26
mace/ops/channel_shuffle.h  +2 -2
mace/ops/channel_shuffle_benchmark.cc  +1 -1
mace/ops/concat.h  +3 -2
mace/ops/concat_benchmark.cc  +8 -11
mace/ops/concat_test.cc  +4 -4
mace/ops/conv_2d_test.cc  +47 -57
mace/ops/eltwise.h  +6 -4
mace/ops/eltwise_benchmark.cc  +1 -1
mace/ops/eltwise_test.cc  +41 -55
mace/ops/folded_batch_norm.cc  +12 -15
mace/ops/folded_batch_norm_test.cc  +1 -1
mace/ops/fully_connected.h  +4 -5
mace/ops/fully_connected_benchmark.cc  +16 -12
mace/ops/fully_connected_test.cc  +32 -66
mace/ops/fused_conv_2d_test.cc  +24 -16
mace/ops/global_avg_pooling.h  +1 -1
mace/ops/global_avg_pooling_benchmark.cc  +2 -2
mace/ops/image_to_buffer.h  +5 -4
mace/ops/matmul.h  +2 -2
mace/ops/matmul_test.cc  +27 -35
mace/ops/ops_test_util.h  +4 -5
mace/ops/pooling.h  +6 -2
mace/ops/pooling_benchmark.cc  +2 -2
mace/ops/pooling_test.cc  +4 -2
mace/ops/reshape.h  +5 -3
mace/ops/reshape_test.cc  +0 -1
mace/ops/softmax.cc  +3 -3
mace/ops/softmax.h  +2 -3
mace/ops/softmax_test.cc  +7 -9
mace/ops/space_to_batch.h  +14 -11
mace/ops/space_to_batch_benchmark.cc  +1 -1
mace/ops/winograd_convolution_test.cc  +22 -14
mace/ops/winograd_inverse_transform.h  +2 -2
mace/ops/winograd_transform.h  +3 -3
mace/ops/winograd_transform_benchmark.cc  +20 -22
mace/public/mace.h  +22 -24
mace/utils/command_line_flags.h  +1 -1
mace/utils/env_time.h  +0 -1
mace/utils/logging.h  +1 -1
mace/utils/string_util.h  +1 -1
mace/utils/timer.h  +4 -12
mace/utils/tuner_test.cc  +8 -21
mace/utils/utils.h  +2 -2
mace/utils/utils_test.cc  +8 -21
.gitlab-ci.yml

 stages:
   - ops_test
   - ops_benchmark
+  - cpplint
+
+cpplint:
+  stage: cpplint
+  only:
+    - master
+  script:
+    - curl -o cpplint.py https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py
+    - python cpplint.py --root=mace --linelength=80 --counting=detailed $(find mace -name *.h -or -name *.cc | grep -vE "half.h")

 ops_test:
   stage: ops_test
...
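The new lint job's file selection can be sketched locally. This is a hypothetical throwaway-directory run, not part of the commit: it rebuilds a tiny `mace/` tree and applies the same `find ... | grep -vE "half.h"` filter. Note the CI line leaves the `*.h`/`*.cc` globs unquoted, so it relies on the shell not expanding them in the build directory; they are quoted here.

```shell
# Sketch of the cpplint job's file selection, using hypothetical paths
# in a throwaway directory. Globs are quoted, unlike the CI line.
cd "$(mktemp -d)"
mkdir -p mace/core
touch mace/core/tensor.h mace/core/net.cc mace/core/half.h mace/core/README.md
find mace -name '*.h' -or -name '*.cc' | grep -vE "half.h" | sort
# prints mace/core/net.cc and mace/core/tensor.h; half.h and the
# README are filtered out
```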
mace/core/allocator.h

@@ -9,8 +9,8 @@
 #include <malloc.h>

 #include "mace/core/registry.h"
-#include "mace/public/mace.h"
 #include "mace/core/types.h"
+#include "mace/public/mace.h"

 namespace mace {
...
@@ -81,7 +81,7 @@ class CPUAllocator : public Allocator {
     free(data);
   };
   void *Map(void *buffer, size_t offset, size_t nbytes) const override {
     return (char *)buffer + offset;
   }
   void *MapImage(void *buffer,
                  const std::vector<size_t> &image_shape,
...
mace/core/arg_helper.cc
mace/core/buffer.h

@@ -5,9 +5,9 @@
 #ifndef MACE_CORE_BUFFER_H_
 #define MACE_CORE_BUFFER_H_

-#include "mace/core/types.h"
-#include "mace/core/allocator.h"
 #include <vector>
+#include "mace/core/allocator.h"
+#include "mace/core/types.h"

 namespace mace {
...
@@ -39,23 +39,19 @@ class BufferBase {
   virtual bool OnHost() const = 0;

   virtual index_t offset() const { return 0; };

   template <typename T>
   const T *data() const {
     return reinterpret_cast<const T *>(raw_data());
   }

   template <typename T>
   T *mutable_data() {
     return reinterpret_cast<T *>(raw_mutable_data());
   }

   index_t size() const { return size_; }

  protected:
   index_t size_;
...
@@ -155,12 +151,10 @@ class Buffer : public BufferBase {
   void Copy(void *src, index_t offset, index_t length) {
     MACE_CHECK_NOTNULL(mapped_buf_);
     MACE_CHECK(length <= size_, "out of buffer");
     memcpy(mapped_buf_, (char *)src + offset, length);
   }

   bool OnHost() const { return allocator_->OnHost(); }

  private:
   Allocator *allocator_;
...
@@ -180,9 +174,10 @@ class Image : public BufferBase {
         mapped_buf_(nullptr) {}

   Image(std::vector<size_t> shape, DataType data_type)
       : BufferBase(std::accumulate(shape.begin(), shape.end(), 1,
                                    std::multiplies<index_t>()) *
                    GetEnumTypeSize(data_type)),
         allocator_(GetDeviceAllocator(OPENCL)),
         mapped_buf_(nullptr) {
     shape_ = shape;
...
@@ -214,9 +209,7 @@ class Image : public BufferBase {
     return mapped_buf_;
   }

   std::vector<size_t> image_shape() const { return shape_; }

   void *Map(index_t offset, index_t length, std::vector<size_t> *pitch) const {
     MACE_NOT_IMPLEMENTED;
...
@@ -241,17 +234,11 @@ class Image : public BufferBase {
     mapped_buf_ = nullptr;
   };

   void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }

   void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }

   bool OnHost() const { return allocator_->OnHost(); }

  private:
   Allocator *allocator_;
...
@@ -266,10 +253,7 @@ class BufferSlice : public BufferBase {
 class BufferSlice : public BufferBase {
  public:
   BufferSlice()
       : buffer_(nullptr), mapped_buf_(nullptr), offset_(0), length_(0) {}
   BufferSlice(BufferBase *buffer, index_t offset, index_t length)
       : BufferBase(buffer->size()),
         buffer_(buffer),
...
@@ -277,17 +261,11 @@ class BufferSlice : public BufferBase {
         offset_(offset),
         length_(length) {
     MACE_CHECK(offset >= 0, "buffer slice offset should >= 0");
     MACE_CHECK(offset + length <= size_, "buffer slice offset + length (",
                offset, " + ", length, ") should <= ", size_);
   }

   BufferSlice(const BufferSlice &other)
       : BufferSlice(other.buffer_, other.offset_, other.length_) {}

   ~BufferSlice() {
     if (buffer_ != nullptr && mapped_buf_ != nullptr) {
...
@@ -303,7 +281,7 @@ class BufferSlice : public BufferBase {
   const void *raw_data() const {
     if (OnHost()) {
       MACE_CHECK_NOTNULL(buffer_);
       return (char *)buffer_->raw_data() + offset_;
     } else {
       MACE_CHECK_NOTNULL(mapped_buf_);
       return mapped_buf_;
...
@@ -320,9 +298,7 @@ class BufferSlice : public BufferBase {
     return nullptr;
   }

   void UnMap(void *mapped_ptr) const { MACE_NOT_IMPLEMENTED; }

   void Map(std::vector<size_t> *pitch) {
     MACE_CHECK_NOTNULL(buffer_);
...
@@ -336,21 +312,13 @@ class BufferSlice : public BufferBase {
     mapped_buf_ = nullptr;
   };

   void Resize(index_t size) { MACE_NOT_IMPLEMENTED; }

   void Copy(void *src, index_t offset, index_t length) { MACE_NOT_IMPLEMENTED; }

   index_t offset() const { return offset_; }

   bool OnHost() const { return buffer_->OnHost(); }

  private:
   BufferBase *buffer_;
...
@@ -358,7 +326,6 @@ class BufferSlice : public BufferBase {
   index_t offset_;
   index_t length_;
 };

 }  // namespace mace

 #endif  // MACE_CORE_BUFFER_H_
mace/core/mace.cc
This diff is collapsed.
mace/core/net.cc

@@ -3,9 +3,9 @@
 //
 #include "mace/core/net.h"

-#include "mace/utils/utils.h"
-#include "mace/utils/timer.h"
 #include "mace/utils/memory_logging.h"
+#include "mace/utils/timer.h"
+#include "mace/utils/utils.h"

 namespace mace {
...
@@ -20,8 +20,7 @@ SerialNet::SerialNet(const std::shared_ptr<const OperatorRegistry> op_registry,
                      Workspace *ws,
                      DeviceType type,
                      const NetMode mode)
     : NetBase(op_registry, net_def, ws, type), device_type_(type) {
   MACE_LATENCY_LOGGER(1, "Constructing SerialNet ", net_def->name());
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     const auto &operator_def = net_def->op(idx);
...
@@ -41,8 +40,8 @@ bool SerialNet::Run(RunMetadata *run_metadata) {
   MACE_LATENCY_LOGGER(1, "Running net");
   for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) {
     auto &op = *iter;
     MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(",
                         op->debug_def().type(), ")");
     bool future_wait = (device_type_ == DeviceType::OPENCL &&
                         (run_metadata != nullptr ||
                          std::distance(iter, operators_.end()) == 1));
...
@@ -99,7 +98,8 @@ std::unique_ptr<NetBase> CreateNet(
     Workspace *ws,
     DeviceType type,
     const NetMode mode) {
   std::unique_ptr<NetBase> net(
       new SerialNet(op_registry, net_def, ws, type, mode));
   return net;
 }
...
mace/core/operator.h

@@ -7,10 +7,10 @@
 #include "mace/core/arg_helper.h"
 #include "mace/core/future.h"
-#include "mace/public/mace.h"
 #include "mace/core/registry.h"
 #include "mace/core/tensor.h"
 #include "mace/core/workspace.h"
+#include "mace/public/mace.h"

 namespace mace {
...
mace/core/preallocated_pooled_allocator.h
mace/core/runtime/hexagon/hexagon_control_wrapper.cc

@@ -2,19 +2,19 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
-#include <vector>
-#include <thread>
 #include <sys/time.h>
+#include <thread>
+#include <vector>

 #include "mace/core/runtime/hexagon/hexagon_control_wrapper.h"
 #include "mace/core/runtime/hexagon/hexagon_nn_ops.h"

 namespace {
 inline int64_t NowMicros() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 }
 }
namespace
mace
{
namespace
mace
{
...
@@ -63,7 +63,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -63,7 +63,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// const node
// const node
std
::
thread
const_thread
([
&
]()
{
std
::
thread
const_thread
([
&
]()
{
std
::
vector
<
hexagon_nn_const_node
>
const_node_list
;
std
::
vector
<
hexagon_nn_const_node
>
const_node_list
;
for
(
const
ConstTensor
&
const_tensor
:
net_def
.
tensors
())
{
for
(
const
ConstTensor
&
const_tensor
:
net_def
.
tensors
())
{
std
::
vector
<
int
>
tensor_shape
(
const_tensor
.
dims
().
begin
(),
std
::
vector
<
int
>
tensor_shape
(
const_tensor
.
dims
().
begin
(),
const_tensor
.
dims
().
end
());
const_tensor
.
dims
().
end
());
while
(
tensor_shape
.
size
()
<
4
)
{
while
(
tensor_shape
.
size
()
<
4
)
{
...
@@ -77,30 +77,30 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -77,30 +77,30 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
const_node
.
tensor
.
width
=
tensor_shape
[
2
];
const_node
.
tensor
.
width
=
tensor_shape
[
2
];
const_node
.
tensor
.
depth
=
tensor_shape
[
3
];
const_node
.
tensor
.
depth
=
tensor_shape
[
3
];
if
(
const_tensor
.
data_type
()
==
DataType
::
DT_INT32
if
(
const_tensor
.
data_type
()
==
DataType
::
DT_INT32
&&
&&
const_tensor
.
data_size
()
==
0
)
{
const_tensor
.
data_size
()
==
0
)
{
const_node
.
tensor
.
data
=
NULL
;
const_node
.
tensor
.
data
=
NULL
;
const_node
.
tensor
.
dataLen
=
0
;
const_node
.
tensor
.
dataLen
=
0
;
}
else
{
}
else
{
const_node
.
tensor
.
data
=
const_node
.
tensor
.
data
=
const_cast
<
unsigned
char
*>
(
const_tensor
.
data
());
const_cast
<
unsigned
char
*>
(
const_tensor
.
data
());
const_node
.
tensor
.
dataLen
=
const_node
.
tensor
.
dataLen
=
const_tensor
.
data_size
()
*
const_tensor
.
data_size
()
*
GetEnumTypeSize
(
const_tensor
.
data_type
());
GetEnumTypeSize
(
const_tensor
.
data_type
());
}
}
const_node_list
.
push_back
(
const_node
);
const_node_list
.
push_back
(
const_node
);
// 255 is magic number: why fastrpc limits sequence length to that?
// 255 is magic number: why fastrpc limits sequence length to that?
if
(
const_node_list
.
size
()
>=
250
)
{
if
(
const_node_list
.
size
()
>=
250
)
{
MACE_CHECK
(
hexagon_nn_append_const_node_list
(
nn_id_
,
MACE_CHECK
(
const_node_list
.
data
(),
hexagon_nn_append_const_node_list
(
nn_id_
,
const_node_list
.
data
(),
const_node_list
.
size
())
const_node_list
.
size
())
==
0
,
==
0
,
"append const node error"
);
"append const node error"
);
const_node_list
.
clear
();
const_node_list
.
clear
();
}
}
}
}
if
(
!
const_node_list
.
empty
())
{
if
(
!
const_node_list
.
empty
())
{
MACE_CHECK
(
hexagon_nn_append_const_node_list
(
nn_id_
,
MACE_CHECK
(
const_node_list
.
data
(),
hexagon_nn_append_const_node_list
(
nn_id_
,
const_node_list
.
data
(),
const_node_list
.
size
())
==
0
,
const_node_list
.
size
())
==
0
,
"append const node error"
);
"append const node error"
);
}
}
...
@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -117,7 +117,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
std
::
vector
<
hexagon_nn_input
>
inputs
;
std
::
vector
<
hexagon_nn_input
>
inputs
;
std
::
vector
<
hexagon_nn_output
>
outputs
;
std
::
vector
<
hexagon_nn_output
>
outputs
;
for
(
const
OperatorDef
&
op
:
net_def
.
op
())
{
for
(
const
OperatorDef
&
op
:
net_def
.
op
())
{
int
op_id
=
op_map
.
GetOpId
(
op
.
type
());
int
op_id
=
op_map
.
GetOpId
(
op
.
type
());
inputs
.
resize
(
op
.
node_input
().
size
());
inputs
.
resize
(
op
.
node_input
().
size
());
for
(
size_t
i
=
0
;
i
<
op
.
node_input
().
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
op
.
node_input
().
size
();
++
i
)
{
...
@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -131,9 +131,8 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
cached_inputs
.
push_back
(
inputs
);
cached_inputs
.
push_back
(
inputs
);
cached_outputs
.
push_back
(
outputs
);
cached_outputs
.
push_back
(
outputs
);
hexagon_nn_padding_type
hexagon_nn_padding_type
padding_type
=
padding_type
=
static_cast
<
hexagon_nn_padding_type
>
(
static_cast
<
hexagon_nn_padding_type
>
(
op
.
padding
());
op
.
padding
());
hexagon_nn_op_node
op_node
;
hexagon_nn_op_node
op_node
;
op_node
.
node_id
=
node_id
(
op
.
node_id
());
op_node
.
node_id
=
node_id
(
op
.
node_id
());
...
@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -146,8 +145,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
op_node_list
.
push_back
(
op_node
);
op_node_list
.
push_back
(
op_node
);
if
(
op_node_list
.
size
()
>=
125
)
{
if
(
op_node_list
.
size
()
>=
125
)
{
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
op_node_list
.
data
(),
op_node_list
.
data
(),
op_node_list
.
size
())
==
0
,
op_node_list
.
size
())
==
0
,
"append node error"
);
"append node error"
);
op_node_list
.
clear
();
op_node_list
.
clear
();
...
@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -157,8 +155,7 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
}
}
if
(
!
op_node_list
.
empty
())
{
if
(
!
op_node_list
.
empty
())
{
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
MACE_CHECK
(
hexagon_nn_append_node_list
(
nn_id_
,
op_node_list
.
data
(),
op_node_list
.
data
(),
op_node_list
.
size
())
==
0
,
op_node_list
.
size
())
==
0
,
"append node error"
);
"append node error"
);
}
}
...
@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -172,10 +169,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// input info
// input info
num_inputs_
=
0
;
num_inputs_
=
0
;
for
(
const
InputInfo
&
input_info
:
net_def
.
input_info
())
{
for
(
const
InputInfo
&
input_info
:
net_def
.
input_info
())
{
std
::
vector
<
index_t
>
input_shape
;
std
::
vector
<
index_t
>
input_shape
;
input_shape
.
insert
(
input_shape
.
begin
(),
input_shape
.
insert
(
input_shape
.
begin
(),
input_info
.
dims
().
begin
(),
input_info
.
dims
().
begin
(),
input_info
.
dims
().
end
());
input_info
.
dims
().
end
());
while
(
input_shape
.
size
()
<
4
)
{
while
(
input_shape
.
size
()
<
4
)
{
input_shape
.
insert
(
input_shape
.
begin
(),
1
);
input_shape
.
insert
(
input_shape
.
begin
(),
1
);
}
}
...
@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
...
@@ -186,10 +183,10 @@ bool HexagonControlWrapper::SetupGraph(const NetDef &net_def) {
// output info
// output info
num_outputs_
=
0
;
num_outputs_
=
0
;
for
(
const
OutputInfo
&
output_info
:
net_def
.
output_info
())
{
for
(
const
OutputInfo
&
output_info
:
net_def
.
output_info
())
{
std
::
vector
<
index_t
>
output_shape
;
std
::
vector
<
index_t
>
output_shape
;
output_shape
.
insert
(
output_shape
.
begin
(),
output_shape
.
insert
(
output_shape
.
begin
(),
output_info
.
dims
().
begin
(),
output_info
.
dims
().
begin
(),
output_info
.
dims
().
end
());
output_info
.
dims
().
end
());
while
(
output_shape
.
size
()
<
4
)
{
while
(
output_shape
.
size
()
<
4
)
{
output_shape
.
insert
(
output_shape
.
begin
(),
1
);
output_shape
.
insert
(
output_shape
.
begin
(),
1
);
}
}
...
@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
...
@@ -218,27 +215,27 @@ bool HexagonControlWrapper::TeardownGraph() {
return
hexagon_nn_teardown
(
nn_id_
)
==
0
;
return
hexagon_nn_teardown
(
nn_id_
)
==
0
;
}
}
#define PRINT_BUFSIZE (2
*1024*
1024)
#define PRINT_BUFSIZE (2
* 1024 *
1024)
void
HexagonControlWrapper
::
PrintLog
()
{
void
HexagonControlWrapper
::
PrintLog
()
{
char
*
buf
;
char
*
buf
;
if
((
buf
=
new
char
[
PRINT_BUFSIZE
])
==
NULL
)
return
;
if
((
buf
=
new
char
[
PRINT_BUFSIZE
])
==
NULL
)
return
;
MACE_CHECK
(
hexagon_nn_getlog
(
nn_id_
,
MACE_CHECK
(
hexagon_nn_getlog
(
nn_id_
,
reinterpret_cast
<
unsigned
char
*>
(
buf
),
reinterpret_cast
<
unsigned
char
*>
(
buf
)
,
PRINT_BUFSIZE
)
==
0
,
PRINT_BUFSIZE
)
==
0
,
"print log error"
);
"print log error"
);
LOG
(
INFO
)
<<
std
::
string
(
buf
);
LOG
(
INFO
)
<<
std
::
string
(
buf
);
delete
[]
buf
;
delete
[]
buf
;
}
}
void
HexagonControlWrapper
::
PrintGraph
()
{
void
HexagonControlWrapper
::
PrintGraph
()
{
LOG
(
INFO
)
<<
"Print Graph"
;
LOG
(
INFO
)
<<
"Print Graph"
;
char
*
buf
;
char
*
buf
;
if
((
buf
=
new
char
[
PRINT_BUFSIZE
])
==
NULL
)
return
;
if
((
buf
=
new
char
[
PRINT_BUFSIZE
])
==
NULL
)
return
;
MACE_CHECK
(
hexagon_nn_snpprint
(
nn_id_
,
MACE_CHECK
(
hexagon_nn_snpprint
(
nn_id_
,
reinterpret_cast
<
unsigned
char
*>
(
buf
),
reinterpret_cast
<
unsigned
char
*>
(
buf
)
,
PRINT_BUFSIZE
)
==
0
,
PRINT_BUFSIZE
)
==
0
,
"print graph error"
);
"print graph error"
);
LOG
(
INFO
)
<<
std
::
string
(
buf
);
LOG
(
INFO
)
<<
std
::
string
(
buf
);
delete
[]
buf
;
delete
[]
buf
;
}
}
void
HexagonControlWrapper
::
SetDebugLevel
(
int
level
)
{
void
HexagonControlWrapper
::
SetDebugLevel
(
int
level
)
{
...
@@ -256,8 +253,8 @@ void HexagonControlWrapper::GetPerfInfo() {
  LOG(INFO) << "Get perf info";
  std::vector<hexagon_nn_perfinfo> perf_info(MAX_NODE);
  unsigned int n_items = 0;
  MACE_CHECK(hexagon_nn_get_perfinfo(nn_id_, perf_info.data(), MAX_NODE,
                                     &n_items) == 0,
             "get perf info error");
  std::unordered_map<uint32_t, float> node_id_counters;
...
@@ -269,8 +266,9 @@ void HexagonControlWrapper::GetPerfInfo() {
    unsigned int node_id = perf_info[i].node_id;
    unsigned int node_type_id = perf_info[i].node_type;
    node_id_counters[node_id] =
        ((static_cast<uint64_t>(perf_info[i].counter_hi) << 32) +
         perf_info[i].counter_lo) *
        1.0f / perf_info[i].executions;
    char node_type_buf[MAX_NODE];
    hexagon_nn_op_id_to_name(node_type_id, node_type_buf, MAX_NODE);
...
@@ -288,7 +286,7 @@ void HexagonControlWrapper::GetPerfInfo() {
    total_duration += node_id_counters[node_id];
  }
  for (auto &node_type_counter : node_type_counters) {
    LOG(INFO) << "node type: " << node_type_counter.first
              << ", time: " << node_type_counter.second.first
              << ", duration: " << node_type_counter.second.second;
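The per-node accounting in `GetPerfInfo` reassembles a 64-bit cycle counter from the two 32-bit halves reported by the DSP (`counter_hi`, `counter_lo`) and averages it over the number of executions. The arithmetic can be isolated as:

```cpp
#include <cstdint>

// Widen the split 32-bit perf counter into one 64-bit cycle count,
// then average over executions, mirroring the expression in GetPerfInfo.
double AverageCycles(uint32_t counter_hi, uint32_t counter_lo,
                     uint32_t executions) {
  uint64_t total = (static_cast<uint64_t>(counter_hi) << 32) + counter_lo;
  return static_cast<double>(total) / executions;
}
```

The `static_cast<uint64_t>` before the shift matters: shifting a 32-bit value left by 32 is undefined behavior in C++, so the widening must happen first.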
...
@@ -312,32 +310,24 @@ bool HexagonControlWrapper::ExecuteGraph(const Tensor &input_tensor,
  output_tensor->Resize(output_shapes_[0]);
  std::vector<uint32_t> output_shape(4);
  uint32_t output_bytes;
  int res = hexagon_nn_execute(
      nn_id_, input_tensor.shape()[0], input_tensor.shape()[1],
      input_tensor.shape()[2], input_tensor.shape()[3],
      reinterpret_cast<const unsigned char *>(input_tensor.raw_data()),
      input_tensor.raw_size(), &output_shape[0], &output_shape[1],
      &output_shape[2], &output_shape[3],
      reinterpret_cast<unsigned char *>(output_tensor->raw_mutable_data()),
      output_tensor->raw_size(), &output_bytes);
  MACE_CHECK(res == 0, "execute error");
  MACE_ASSERT(output_shape == output_shapes_[0],
              "wrong output shape inferred");
  MACE_ASSERT(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
  return res == 0;
};

bool HexagonControlWrapper::ExecuteGraphNew(
    const std::vector<Tensor> &input_tensors,
    std::vector<Tensor> *output_tensors) {
  LOG(INFO) << "Execute graph new: " << nn_id_;
  int num_inputs = input_tensors.size();
...
@@ -369,8 +359,8 @@ bool HexagonControlWrapper::ExecuteGraphNew(const std::vector<Tensor> &input_ten
    outputs[i].dataLen = (*output_tensors)[i].raw_size();
  }
  int res = hexagon_nn_execute_new(nn_id_, inputs, num_inputs,
                                   outputs, num_outputs);
  for (int i = 0; i < num_outputs; ++i) {
    std::vector<uint32_t> output_shape{outputs[i].batches, outputs[i].height,
...
@@ -397,9 +387,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
  float *min_in_data = input_tensors[1].mutable_data<float>();
  input_tensors[2].Resize({1, 1, 1, 1});
  float *max_in_data = input_tensors[2].mutable_data<float>();
  quantizer_.Quantize(input_tensor, &input_tensors[0], min_in_data,
                      max_in_data);
  if (!ExecuteGraphNew(input_tensors, &output_tensors)) {
    return false;
...
@@ -409,9 +397,7 @@ bool HexagonControlWrapper::ExecuteGraphPreQuantize(const Tensor &input_tensor,
  const float *min_out_data = output_tensors[1].data<float>();
  const float *max_out_data = output_tensors[2].data<float>();
  quantizer_.DeQuantize(output_tensors[0], *min_out_data, *max_out_data,
                        output_tensor);
  return true;
}
...
mace/core/runtime/hexagon/hexagon_control_wrapper.h
...
@@ -16,16 +16,17 @@ namespace mace {
class HexagonControlWrapper {
 public:
  HexagonControlWrapper() {}
  int GetVersion();
  bool Config();
  bool Init();
  bool Finalize();
  bool SetupGraph(const NetDef &net_def);
  bool ExecuteGraph(const Tensor &input_tensor, Tensor *output_tensor);
  bool ExecuteGraphNew(const std::vector<Tensor> &input_tensors,
                       std::vector<Tensor> *output_tensors);
  bool ExecuteGraphPreQuantize(const Tensor &input_tensor,
                               Tensor *output_tensor);
  bool TeardownGraph();
  void PrintLog();
...
@@ -38,9 +39,7 @@ class HexagonControlWrapper {
 private:
  static constexpr int NODE_ID_OFFSET = 10000;
  inline uint32_t node_id(uint32_t nodeid) { return NODE_ID_OFFSET + nodeid; }
  int nn_id_;
  Quantizer quantizer_;
...
@@ -54,7 +53,6 @@ class HexagonControlWrapper {
  DISABLE_COPY_AND_ASSIGN(HexagonControlWrapper);
};
}
#endif  // MACE_DSP_HEXAGON_CONTROL_WRAPPER_H_
mace/core/runtime/hexagon/hexagon_controller_dummy.cc
...
@@ -10,31 +10,145 @@ int hexagon_controller_InitHexagonWithMaxAttributes(int enable_dcvs,
  return 0;
}

int hexagon_controller_DeInitHexagon() {
  return 0;
}

__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
    hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
    hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(
    hexagon_nn_nn_id id, unsigned char *buf, int bufLen)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(
    hexagon_nn_nn_id id, unsigned char *buf, int bufLen)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
    hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation,
    hexagon_nn_padding_type padding, const hexagon_nn_input *inputs,
    int inputsLen, const hexagon_nn_output *outputs, int outputsLen)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
    hexagon_nn_nn_id id, const hexagon_nn_op_node *ops, int opsLen)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
    hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches,
    unsigned int height, unsigned int width, unsigned int depth,
    const unsigned char *data, int dataLen) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
    hexagon_nn_nn_id id, const hexagon_nn_const_node *consts, int constsLen)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
    hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in,
    unsigned int width_in, unsigned int depth_in, const unsigned char *data_in,
    int data_inLen, unsigned int *batches_out, unsigned int *height_out,
    unsigned int *width_out, unsigned int *depth_out, unsigned char *data_out,
    int data_outLen, unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(
    hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
    unsigned int level) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
    hexagon_nn_nn_id id, hexagon_nn_perfinfo *info_out, int info_outLen,
    unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
    hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
    hexagon_nn_nn_id id, unsigned int *cycles_lo, unsigned int *cycles_hi)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
    const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
    unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
    __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
    int *ver) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
    const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
    hexagon_nn_nn_id id, const hexagon_nn_tensordef *inputs, int inputsLen,
    hexagon_nn_tensordef *outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE {
  return 0;
}
mace/core/runtime/hexagon/hexagon_nn.h
...
@@ -30,7 +30,7 @@ extern "C" {
#define __QAIC_STRING1_OBJECT_DEFINED__
#define __STRING1_OBJECT__
typedef struct _cstring1_s {
  char *data;
  int dataLen;
} _cstring1_t;
...
@@ -71,7 +71,7 @@ struct hexagon_nn_tensordef {
  unsigned int height;
  unsigned int width;
  unsigned int depth;
  unsigned char *data;
  int dataLen;
  unsigned int data_valid_len;
  unsigned int unused;
...
@@ -81,9 +81,9 @@ struct hexagon_nn_op_node {
  unsigned int node_id;
  unsigned int operation;
  hexagon_nn_padding_type padding;
  hexagon_nn_input *inputs;
  int inputsLen;
  hexagon_nn_output *outputs;
  int outputsLen;
};
typedef struct hexagon_nn_const_node hexagon_nn_const_node;
...
@@ -91,30 +91,98 @@ struct hexagon_nn_const_node {
  unsigned int node_id;
  hexagon_nn_tensordef tensor;
};
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_config)(void)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_init)(void)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_debug_level)(
    hexagon_nn_nn_id id, int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_graph_mode)(
    hexagon_nn_nn_id id, int mode) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_snpprint)(
    hexagon_nn_nn_id id, unsigned char *buf, int bufLen)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_getlog)(
    hexagon_nn_nn_id id, unsigned char *buf, int bufLen)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node)(
    hexagon_nn_nn_id id, unsigned int node_id, unsigned int operation,
    hexagon_nn_padding_type padding, const hexagon_nn_input *inputs,
    int inputsLen, const hexagon_nn_output *outputs, int outputsLen)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_node_list)(
    hexagon_nn_nn_id id, const hexagon_nn_op_node *ops, int opsLen)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node)(
    hexagon_nn_nn_id id, unsigned int node_id, unsigned int batches,
    unsigned int height, unsigned int width, unsigned int depth,
    const unsigned char *data, int dataLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_append_const_node_list)(
    hexagon_nn_nn_id id, const hexagon_nn_const_node *consts, int constsLen)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_prepare)(hexagon_nn_nn_id id)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute)(
    hexagon_nn_nn_id id, unsigned int batches_in, unsigned int height_in,
    unsigned int width_in, unsigned int depth_in, const unsigned char *data_in,
    int data_inLen, unsigned int *batches_out, unsigned int *height_out,
    unsigned int *width_out, unsigned int *depth_out, unsigned char *data_out,
    int data_outLen, unsigned int *data_len_out) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_teardown)(
    hexagon_nn_nn_id id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_set_powersave_level)(
    unsigned int level) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_get_perfinfo)(
    hexagon_nn_nn_id id, hexagon_nn_perfinfo *info_out, int info_outLen,
    unsigned int *n_items) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_reset_perfinfo)(
    hexagon_nn_nn_id id, unsigned int event) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_last_execution_cycles)(
    hexagon_nn_nn_id id, unsigned int *cycles_lo, unsigned int *cycles_hi)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_version)(int *ver)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_name_to_id)(
    const char *name, unsigned int *node_id) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_op_id_to_name)(
    unsigned int node_id, char *name, int nameLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_disable_dcvs)(void)
    __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_GetHexagonBinaryVersion)(
    int *ver) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_PrintLog)(
    const unsigned char *buf, int bufLen) __QAIC_HEADER_ATTRIBUTE;
__QAIC_HEADER_EXPORT int __QAIC_HEADER(hexagon_nn_execute_new)(
    hexagon_nn_nn_id id, const hexagon_nn_tensordef *inputs, int inputsLen,
    hexagon_nn_tensordef *outputs, int outputsLen) __QAIC_HEADER_ATTRIBUTE;
#ifdef __cplusplus
}
#endif
...
mace/core/runtime/hexagon/hexagon_nn_ops.h
...
@@ -5,8 +5,8 @@
#ifndef LIBMACE_HEXAGON_NN_OPS_H
#define LIBMACE_HEXAGON_NN_OPS_H
#include <unordered_map>

#include "mace/utils/logging.h"

namespace mace {
...
@@ -24,8 +24,7 @@ typedef enum op_type_enum {
class OpMap {
 public:
  void Init() {
#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
#include "mace/core/runtime/hexagon/ops.h"
...
@@ -40,6 +39,7 @@ class OpMap {
      return OP_INVALID;
    }
  }

 private:
  std::unordered_map<std::string, int> op_map_;
};
...
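`OpMap::Init` above uses the X-macro pattern: `DEF_OP(NAME)` is defined to emit one map entry, and `#include "ops.h"` expands the whole op list through it. A self-contained sketch of the same technique, with the op list written inline instead of included from `ops.h` (the two ops and the `GetOpId` name here are illustrative):

```cpp
#include <string>
#include <unordered_map>

enum op_type { OP_INPUT, OP_Relu, OP_INVALID };

class OpMap {
 public:
  void Init() {
    // X-macro: each DEF_OP(NAME) expands to one op_map_ entry, using
    // #NAME for the string key and token pasting for the enum value.
#define DEF_OP(NAME) op_map_[#NAME] = OP_##NAME;
    DEF_OP(INPUT)
    DEF_OP(Relu)
#undef DEF_OP
  }

  // Look up an op name; unknown names map to OP_INVALID.
  int GetOpId(const std::string &op_type) {
    auto it = op_map_.find(op_type);
    return it != op_map_.end() ? it->second : OP_INVALID;
  }

 private:
  std::unordered_map<std::string, int> op_map_;
};
```

Keeping the op list in one header and expanding it through different `DEF_OP` definitions means the enum and the name table can never drift out of sync.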
mace/core/runtime/hexagon/ops.h
...
@@ -178,4 +178,3 @@ DEF_OP(QuantizedBiasAdd_8p8to8)
#undef __SELF_DEF_OP_WREF
#undef DEF_OP_WREF
#endif
mace/core/runtime/hexagon/quantize.cc
...
@@ -29,16 +29,16 @@ void Quantizer::Quantize(const Tensor &in_tensor,
                          float *max_out) {
  float stepsize;
  float recip_stepsize;
  QuantizeAdjustRange(min_in, max_in, min_out, max_out, &stepsize,
                      &recip_stepsize);
  const float *in = in_tensor.data<float>();
  uint8_t *out = out_tensor->mutable_data<uint8_t>();
  for (int i = 0; i < in_tensor.size(); i++) {
    const float inval = in[i];
    // Round to nearest, then clamp before narrowing so out-of-range
    // values saturate instead of wrapping.
    float ival = (inval - *min_out) * recip_stepsize + 0.5f;
    if (ival < 0) ival = 0;
    if (ival > 255) ival = 255;
    out[i] = static_cast<uint8_t>(ival);
...
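The quantize loop above is an affine float-to-uint8 mapping: subtract the range minimum, scale by the reciprocal step size, round to nearest, and clamp to [0, 255]. A standalone sketch of the arithmetic, with `QuantizeAdjustRange` simplified to a plain step size over [min, max] (an assumption; the production range-adjustment logic is more involved):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Affine float -> uint8 quantization over [min, max] with the same
// round-and-clamp scheme as Quantizer::Quantize.
std::vector<uint8_t> QuantizeFloat(const std::vector<float> &in,
                                   float min, float max) {
  const float recip_stepsize = 255.0f / (max - min);
  std::vector<uint8_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    // Round to nearest by adding 0.5 before truncation, clamp to [0, 255].
    float ival = (in[i] - min) * recip_stepsize + 0.5f;
    ival = std::max(0.0f, std::min(255.0f, ival));
    out[i] = static_cast<uint8_t>(ival);
  }
  return out;
}
```

Dequantization is the inverse: `value = min + q * stepsize`, which is what `Quantizer::DeQuantize` applies to the DSP output using the min/max tensors returned alongside it.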
mace/core/runtime/hexagon/quantize.h
...
@@ -16,13 +16,17 @@ class Quantizer {
  void Quantize(const Tensor &in_tensor,
                Tensor *out_tensor,
                float *min_out,
                float *max_out);
  void Quantize(const Tensor &in_tensor,
                const float min_in,
                const float max_in,
                Tensor *out_tensor,
                float *min_out,
                float *max_out);
  void DeQuantize(const Tensor &in_tensor,
                  const float min_in,
                  const float max_in,
                  Tensor *out_tensor);

 private:
...
mace/core/runtime/opencl/opencl_allocator.cc
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_allocator.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"

namespace mace {
@@ -29,7 +29,6 @@ static cl_channel_type DataTypeToCLChannelType(const DataType t) {
      return 0;
  }
}

OpenCLAllocator::OpenCLAllocator() {}
@@ -49,17 +48,16 @@ void *OpenCLAllocator::New(size_t nbytes) const {
void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
                                const DataType dt) const {
  MACE_CHECK(image_shape.size() == 2) << "Image shape's size must equal 2";
  VLOG(3) << "Allocate OpenCL image: " << image_shape[0] << ", "
          << image_shape[1];

  cl::ImageFormat img_format(CL_RGBA, DataTypeToCLChannelType(dt));

  cl_int error;
  cl::Image2D *cl_image =
      new cl::Image2D(OpenCLRuntime::Global()->context(),
                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, img_format,
                      image_shape[0], image_shape[1], 0, nullptr, &error);
  MACE_CHECK(error == CL_SUCCESS) << error << " with image shape: ["
                                  << image_shape[0] << ", " << image_shape[1]
                                  << "]";
@@ -89,8 +87,8 @@ void *OpenCLAllocator::Map(void *buffer, size_t offset, size_t nbytes) const {
  // TODO(heliangliang) Non-blocking call
  cl_int error;
  void *mapped_ptr =
      queue.enqueueMapBuffer(*cl_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
                             offset, nbytes, nullptr, nullptr, &error);
  MACE_CHECK(error == CL_SUCCESS);
  return mapped_ptr;
}
@@ -106,13 +104,10 @@ void *OpenCLAllocator::MapImage(void *buffer,
  mapped_image_pitch->resize(2);

  cl_int error;
  void *mapped_ptr =
      OpenCLRuntime::Global()->command_queue().enqueueMapImage(
          *cl_image, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, origin, region,
          mapped_image_pitch->data(), mapped_image_pitch->data() + 1, nullptr,
          nullptr, &error);
  MACE_CHECK(error == CL_SUCCESS) << error;

  return mapped_ptr;
...
mace/core/runtime/opencl/opencl_development.cc
...
@@ -5,8 +5,8 @@
#include <vector>

#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"

namespace mace {
@@ -16,7 +16,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
                              cl::Device &device,
                              cl::Program *program,
                              bool *is_binary) {
  extern const std::map<std::string, std::vector<unsigned char>>
      kEncryptedProgramMap;
  *is_binary = false;
  auto it_source = kEncryptedProgramMap.find(program_name);
  if (it_source == kEncryptedProgramMap.end()) {
...
mace/core/runtime/opencl/opencl_production.cc
...
@@ -14,7 +14,8 @@ bool GetSourceOrBinaryProgram(const std::string &program_name,
                              cl::Device &device,
                              cl::Program *program,
                              bool *is_binary) {
  extern const std::map<std::string, std::vector<unsigned char>>
      kCompiledProgramMap;
  *is_binary = true;
  auto it_binary = kCompiledProgramMap.find(binary_file_name_prefix);
  if (it_binary == kCompiledProgramMap.end()) {
...
mace/core/runtime/opencl/opencl_runtime.cc
...
@@ -48,11 +48,9 @@ double OpenCLProfilingTimer::ElapsedMicros() {
  return (stop_nanos_ - start_nanos_) / 1000.0;
}

double OpenCLProfilingTimer::AccumulatedMicros() { return accumulated_micros_; }

void OpenCLProfilingTimer::AccumulateTiming() {
  StopTiming();
  accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
}
@@ -116,7 +114,8 @@ OpenCLRuntime::OpenCLRuntime() {
  cl::CommandQueue command_queue(context, gpu_device, properties);

  const char *kernel_path = getenv("MACE_KERNEL_PATH");
  this->kernel_path_ =
      std::string(kernel_path == nullptr ? "" : kernel_path) + "/";

  this->device_ = new cl::Device(gpu_device);
  this->context_ = new cl::Context(context);
@@ -166,15 +165,11 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
      GenerateCLBinaryFilenamePrefix(built_program_key);

  std::vector<unsigned char> program_vec;
  bool is_opencl_binary;
  const bool found =
      GetSourceOrBinaryProgram(program_name, binary_file_name_prefix,
                               context(), device(), program,
                               &is_opencl_binary);
  MACE_CHECK(found, "Program not found for ",
             is_opencl_binary ? "binary: " : "source: ", built_program_key);

  // Build program
  std::string build_options_str =
@@ -190,13 +185,13 @@ void OpenCLRuntime::BuildProgram(const std::string &program_name,
    }
    LOG(FATAL) << "Build program from "
               << (is_opencl_binary ? "binary: " : "source: ")
               << built_program_key << " failed: " << ret;
  }

  if (!is_opencl_binary) {
    // Write binary if necessary
    std::string binary_filename =
        kernel_path_ + binary_file_name_prefix + ".bin";
    size_t device_list_size = 1;
    std::unique_ptr<size_t[]> program_binary_sizes(
        new size_t[device_list_size]);
@@ -240,8 +235,8 @@ cl::Kernel OpenCLRuntime::BuildKernel(
  if (built_program_it != built_program_map_.end()) {
    program = built_program_it->second;
  } else {
    this->BuildProgram(program_name, built_program_key, build_options_str,
                       &program);
    built_program_map_.emplace(built_program_key, program);
  }
  return cl::Kernel(program, kernel_name.c_str());
...
mace/core/runtime/opencl/opencl_runtime.h
...
@@ -19,7 +19,8 @@ namespace mace {
class OpenCLProfilingTimer : public Timer {
 public:
  explicit OpenCLProfilingTimer(const cl::Event *event)
      : event_(event), accumulated_micros_(0) {};
  void StartTiming() override;
  void StopTiming() override;
  void AccumulateTiming() override;
@@ -48,6 +49,7 @@ class OpenCLRuntime {
  cl::Kernel BuildKernel(const std::string &program_name,
                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);

 private:
  OpenCLRuntime();
  ~OpenCLRuntime();
...
mace/core/runtime/opencl/opencl_wrapper.h
...
@@ -7,9 +7,9 @@
namespace mace {

// These functions are not thread-safe.
void LoadOpenCLLibrary();
void UnloadOpenCLLibrary();

}  // namespace mace
...
mace/core/tensor.h
...
@@ -69,13 +69,10 @@ class Tensor {
        dtype_(type),
        buffer_(nullptr),
        is_buffer_owner_(true),
        name_("") {};

  Tensor(BufferBase *buffer, DataType dtype)
      : dtype_(dtype), buffer_(buffer), is_buffer_owner_(false), name_("") {}

  Tensor(const BufferSlice &buffer_slice, DataType dtype)
      : dtype_(dtype),
@@ -102,8 +99,8 @@ class Tensor {
  inline index_t dim_size() const { return shape_.size(); }

  inline index_t dim(unsigned int index) const {
    MACE_CHECK(index < shape_.size(), "Dim out of range: ", index, " >= ",
               shape_.size());
    return shape_[index];
  }
@@ -112,40 +109,35 @@ class Tensor {
                           std::multiplies<int64_t>());
  }

  inline index_t raw_size() const { return size() * SizeOfType(); }

  inline bool has_opencl_image() const {
    return buffer_ != nullptr && !buffer_->OnHost() &&
           typeid(*buffer_) == typeid(Image);
  }

  inline bool has_opencl_buffer() const {
    return buffer_ != nullptr && !buffer_->OnHost() && !has_opencl_image();
  }

  inline cl::Image *opencl_image() const {
    MACE_CHECK(has_opencl_image(), "do not have image");
    return static_cast<cl::Image *>(buffer_->buffer());
  }

  inline cl::Buffer *opencl_buffer() const {
    MACE_CHECK(has_opencl_buffer(), "do not have opencl buffer");
    return static_cast<cl::Buffer *>(buffer_->buffer());
  }

  inline index_t buffer_offset() const { return buffer_->offset(); }

  inline const void *raw_data() const {
    MACE_CHECK(buffer_ != nullptr, "buffer is null");
    return buffer_->raw_data();
  }

  template <typename T>
  inline const T *data() const {
    MACE_CHECK(buffer_ != nullptr, "buffer is null");
    return buffer_->data<T>();
@@ -156,7 +148,7 @@ class Tensor {
    return buffer_->raw_mutable_data();
  }

  template <typename T>
  inline T *mutable_data() {
    MACE_CHECK(buffer_ != nullptr, "buffer is null");
    return static_cast<T *>(buffer_->raw_mutable_data());
@@ -188,25 +180,17 @@ class Tensor {
      is_buffer_owner_ = true;
    } else {
      MACE_CHECK(has_opencl_image(), "Cannot ResizeImage buffer, use Resize.");
      Image *image = dynamic_cast<Image *>(buffer_);
      MACE_CHECK(image_shape[0] <= image->image_shape()[0] &&
                     image_shape[1] <= image->image_shape()[1],
                 "tensor (source op ", name_,
                 "): current physical image shape: ", image->image_shape()[0],
                 ", ", image->image_shape()[1], " < logical image shape: ",
                 image_shape[0], ", ", image_shape[1]);
    }
  }

  inline void ResizeLike(const Tensor &other) { ResizeLike(&other); }

  inline void ResizeLike(const Tensor *other) {
    if (other->has_opencl_image()) {
@@ -229,7 +213,7 @@ class Tensor {
    memcpy(buffer_->raw_mutable_data(), src, size);
  }

  template <typename T>
  inline void Copy(const T *src, index_t length) {
    MACE_CHECK(length == size(), "copy src and dst with different size.");
    CopyBytes(static_cast<const void *>(src), sizeof(T) * length);
@@ -248,13 +232,9 @@ class Tensor {
    return type_size;
  }

  inline BufferBase *UnderlyingBuffer() const { return buffer_; }

  inline void SetSourceOpName(const std::string name) { name_ = name; }

  inline void DebugPrint() const {
    using namespace numerical_chars;
@@ -272,8 +252,9 @@ class Tensor {
      }
      CASES(dtype_, (os << (this->data<T>()[i]) << ", "));
    }
    LOG(INFO) << "Tensor size: [" << dim(0) << ", " << dim(1) << ", "
              << dim(2) << ", " << dim(3) << "], content:\n" << os.str();
  }

  class MappingGuard {
@@ -308,7 +289,7 @@ class Tensor {
  Allocator *allocator_;
  DataType dtype_;
  std::vector<index_t> shape_;
  std::vector<size_t> image_shape_;
  BufferBase *buffer_;
  BufferSlice buffer_slice_;
  bool is_buffer_owner_;
...
mace/core/testing/test_benchmark.cc
...
@@ -99,9 +99,7 @@ void RestartTiming() {
  accum_time = 0;
  start_time = NowMicros();
}

void StartTiming() { start_time = NowMicros(); }

void StopTiming() {
  if (start_time != 0) {
    accum_time += (NowMicros() - start_time);
...
mace/core/testing/test_benchmark.h
...
@@ -6,9 +6,9 @@
#ifndef MACE_CORE_TESTING_TEST_BENCHMARK_H_
#define MACE_CORE_TESTING_TEST_BENCHMARK_H_

#include <string>
#include <utility>
#include <vector>

#define MACE_BENCHMARK_CONCAT(a, b, c) a##b##c
#define BENCHMARK(n) \
...
mace/core/types.cc
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include <cstdint>
#include <map>

#include "mace/core/types.h"
#include "mace/utils/logging.h"
@@ -30,18 +30,12 @@ bool DataTypeCanUseMemcpy(DataType dt) {
std::string DataTypeToString(const DataType dt) {
  static std::map<DataType, std::string> dtype_string_map = {
      {DT_FLOAT, "DT_FLOAT"},   {DT_HALF, "DT_HALF"},
      {DT_DOUBLE, "DT_DOUBLE"}, {DT_UINT8, "DT_UINT8"},
      {DT_INT8, "DT_INT8"},     {DT_INT32, "DT_INT32"},
      {DT_UINT32, "DT_UINT32"}, {DT_UINT16, "DT_UINT16"},
      {DT_INT64, "DT_INT64"},   {DT_BOOL, "DT_BOOL"},
      {DT_STRING, "DT_STRING"}};
  MACE_CHECK(dt != DT_INVALID) << "Not support Invalid data type";
  return dtype_string_map[dt];
}
...
mace/core/workspace.cc
...
@@ -5,8 +5,8 @@
#include <string>
#include <vector>

#include "mace/core/arg_helper.h"
#include "mace/core/workspace.h"
#include "mace/utils/timer.h"

namespace mace {
@@ -51,19 +51,19 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
  index_t model_data_size = 0;
  unsigned char *model_data_ptr = nullptr;
  for (auto &const_tensor : net_def.tensors()) {
    if (model_data_ptr == nullptr ||
        reinterpret_cast<long long>(const_tensor.data()) <
            reinterpret_cast<long long>(model_data_ptr)) {
      model_data_ptr = const_cast<unsigned char *>(const_tensor.data());
    }
  }
  for (auto &const_tensor : net_def.tensors()) {
    model_data_size = std::max(
        model_data_size,
        static_cast<index_t>(
            (reinterpret_cast<long long>(const_tensor.data()) -
             reinterpret_cast<long long>(model_data_ptr)) +
            const_tensor.data_size() *
                GetEnumTypeSize(const_tensor.data_type())));
  }
  VLOG(3) << "Model data size: " << model_data_size;
@@ -81,8 +81,7 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
  for (auto &const_tensor : net_def.tensors()) {
    MACE_LATENCY_LOGGER(2, "Load tensor ", const_tensor.name());
    VLOG(3) << "Tensor name: " << const_tensor.name()
            << ", data type: " << const_tensor.data_type() << ", shape: "
            << MakeString(std::vector<index_t>(const_tensor.dims().begin(),
                                               const_tensor.dims().end()));
    std::vector<index_t> dims;
@@ -90,13 +89,11 @@ void Workspace::LoadModelTensor(const NetDef &net_def, DeviceType type) {
      dims.push_back(d);
    }

    index_t offset =
        (long long)const_tensor.data() - (long long)model_data_ptr;
    std::unique_ptr<Tensor> tensor(
        new Tensor(BufferSlice(tensor_buffer_.get(), offset,
                               const_tensor.data_size() *
                                   GetEnumTypeSize(const_tensor.data_type())),
                   const_tensor.data_type()));

    tensor->Reshape(dims);
@@ -118,13 +115,11 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
  // as GPU have consistent data type for each layer for now.
  // As DSP may have different data output type for each op,
  // we stick to the same concept.
  for (auto &op : net_def.op()) {
    if (op.has_mem_id()) {
      const DataType op_dtype = static_cast<DataType>(
          ArgumentHelper::GetSingleArgument<OperatorDef, int>(
              op, "T", static_cast<int>(DT_FLOAT)));
      if (op_dtype != DataType::DT_INVALID) {
        dtype = op_dtype;
        // find first valid data type, break
@@ -133,22 +128,24 @@ void Workspace::CreateImageOutputTensor(const NetDef &net_def) {
      }
    }
  }
  MACE_CHECK(dtype != DataType::DT_INVALID, "data type is invalid.");
  for (auto &mem_block : net_def.mem_arena().mem_block()) {
    std::unique_ptr<BufferBase> image_buf(
        new Image({mem_block.x(), mem_block.y()}, dtype));
    preallocated_allocator_.SetBuffer(mem_block.mem_id(),
                                      std::move(image_buf));
  }
  VLOG(3) << "Preallocate image to tensors";
  for (auto &op : net_def.op()) {
    if (op.has_mem_id()) {
      std::unique_ptr<Tensor> tensor(
          new Tensor(preallocated_allocator_.GetBuffer(op.mem_id()), dtype));
      tensor->SetSourceOpName(op.name());
      VLOG(3) << "Tensor: " << op.name() << "(" << op.type() << ")"
              << "; Mem: " << op.mem_id() << "; Image shape: "
              << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
                     ->image_shape()[0]
              << ", "
              << dynamic_cast<Image *>(tensor->UnderlyingBuffer())
                     ->image_shape()[1];
      tensor_map_[op.output(0)] = std::move(tensor);
    }
  }
...
mace/core/workspace.h
...
@@ -5,9 +5,9 @@
#ifndef MACE_CORE_WORKSPACE_H_
#define MACE_CORE_WORKSPACE_H_

#include "mace/core/preallocated_pooled_allocator.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"

namespace mace {
...
mace/kernels/activation.h
...
@@ -6,9 +6,9 @@
#define MACE_KERNELS_ACTIVATION_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"

namespace mace {
namespace kernels {
@@ -99,15 +99,13 @@ void PReLUActivation(const T *input_ptr,
      output_ptr[i] = in;
    }
  }
}

template <DeviceType D, typename T>
class ActivationFunctor {
 public:
  ActivationFunctor(ActivationType type, T relux_max_limit)
      : activation_(type), relux_max_limit_(relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *alpha,
@@ -118,9 +116,11 @@ class ActivationFunctor {
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
      const T *alpha_ptr = alpha->data<T>();
      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
                      output_ptr);
    } else {
      DoActivation(input_ptr, output_ptr, output->size(), activation_,
                   relux_max_limit_);
    }
  }
@@ -131,14 +131,16 @@ class ActivationFunctor {
template <>
void ActivationFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input, const Tensor *alpha, Tensor *output,
    StatsFuture *future);

template <typename T>
class ActivationFunctor<DeviceType::OPENCL, T> {
 public:
  ActivationFunctor(ActivationType type, T relux_max_limit)
      : activation_(type), relux_max_limit_(relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *alpha,
...
mace/kernels/addn.h
...
@@ -18,7 +18,7 @@ namespace mace {
namespace kernels {

namespace {
constexpr int kCostPerGroup = 1024;
}  // namespace

template <DeviceType D, typename T>
...
mace/kernels/batch_norm.h
...
@@ -10,10 +10,10 @@
#endif

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/public/mace.h"

namespace mace {
namespace kernels {
@@ -24,7 +24,7 @@ struct BatchNormFunctorBase {
                       const float relux_max_limit)
      : folded_constant_(folded_constant),
        activation_(activation),
        relux_max_limit_(relux_max_limit) {}

  const bool folded_constant_;
  const ActivationType activation_;
@@ -36,8 +36,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
  BatchNormFunctor(const bool folded_constant,
                   const ActivationType activation,
                   const float relux_max_limit)
      : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *scale,
@@ -147,8 +146,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
  BatchNormFunctor(const bool folded_constant,
                   const ActivationType activation,
                   const float relux_max_limit)
      : BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *scale,
                  const Tensor *offset,
...
mace/kernels/bias_add.h (View file @ 4410ecd2)

@@ -6,9 +6,9 @@
#define MACE_KERNELS_BIAS_ADD_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"

namespace mace {
namespace kernels {

@@ -32,7 +32,6 @@ struct BiasAddFunctor {
    const T *bias_ptr = bias->data<T>();
    T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for collapse(4)
    for (index_t n = 0; n < batch; ++n) {
      for (index_t h = 0; h < height; ++h) {

@@ -44,7 +43,6 @@ struct BiasAddFunctor {
        }
      }
    }
  }
};
...
mace/kernels/buffer_to_image.h (View file @ 4410ecd2)

@@ -17,10 +17,9 @@ struct BufferToImageFunctorBase {
  bool i2b_;
};

template <DeviceType D, typename T>
struct BufferToImageFunctor : BufferToImageFunctorBase {
  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
  void operator()(Tensor *input,
                  const BufferType type,
                  Tensor *output,

@@ -29,10 +28,9 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
  }
};

template <typename T>
struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
  void operator()(Tensor *input,
                  const BufferType type,
                  Tensor *output,
...
mace/kernels/channel_shuffle.h (View file @ 4410ecd2)

@@ -16,8 +16,10 @@ class ChannelShuffleFunctor {
 public:
  ChannelShuffleFunctor(const int group) : group_(group) {}

  void operator()(const T *input,
                  const index_t *input_shape,
                  T *output,
                  StatsFuture *future) {
    index_t batch = input_shape[0];
    index_t channels = input_shape[1];
    index_t height = input_shape[2];
...
mace/kernels/concat.h (View file @ 4410ecd2)

@@ -6,23 +6,23 @@
#define MACE_KERNELS_CONCAT_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/public/mace.h"

namespace mace {
namespace kernels {

struct ConcatFunctorBase {
  ConcatFunctorBase(const int32_t axis) : axis_(axis) {}

  int32_t axis_;
};

template <DeviceType D, typename T>
struct ConcatFunctor : ConcatFunctorBase {
  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}

  void operator()(const std::vector<const Tensor *> &input_list,
                  Tensor *output,

@@ -75,14 +75,14 @@ struct ConcatFunctor : ConcatFunctorBase {
  }
};

template <typename T>
struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}

  void operator()(const std::vector<const Tensor *> &input_list,
                  Tensor *output,
                  StatsFuture *future);

  cl::Kernel kernel_;
};

}  // namepsace kernels
...
mace/kernels/conv_2d.h (View file @ 4410ecd2)

@@ -116,8 +116,7 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
            sum[sum_idx] += vaddvq_f32(tmp);
#else
            for (int inci = 0; inci < inc_tile_size; ++inci) {
              sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
                              weights[weights_idx * inc_tile_size + inci];
            }
#endif

@@ -188,7 +187,7 @@ struct Conv2dFunctorBase {
        paddings_(paddings),
        dilations_(dilations),
        activation_(activation),
        relux_max_limit_(relux_max_limit) {}

  const int *strides_;  // [stride_h, stride_w]
  const Padding padding_type_;

@@ -230,8 +229,9 @@ struct Conv2dFunctor : Conv2dFunctorBase {
          padding_type_, output_shape.data(), paddings.data());
    } else {
      paddings = paddings_;
      CalcOutputSize(input->shape().data(), filter->shape().data(),
                     paddings_.data(), dilations_, strides_, RoundType::FLOOR,
                     output_shape.data());
    }
    output->Resize(output_shape);
...
mace/kernels/conv_pool_2d_util.cc (View file @ 4410ecd2)

@@ -159,18 +159,29 @@ void CalcOutputSize(const index_t *input_shape,  // NHWC
  */
  output_shape[0] = input_shape[0];
  if (round_type == FLOOR) {
    output_shape[1] = static_cast<index_t>(
        std::floor(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
                          (filter_shape[0] - 1) * (dilations[0] - 1)) /
                   strides[0]) + 1);
    output_shape[2] = static_cast<index_t>(
        std::floor(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
                          (filter_shape[1] - 1) * (dilations[1] - 1)) /
                   strides[1]) + 1);
  } else {
    output_shape[1] = static_cast<index_t>(
        std::ceil(1.0 * (input_shape[1] + padding_size[0] - filter_shape[0] -
                         (filter_shape[0] - 1) * (dilations[0] - 1)) /
                  strides[0]) + 1);
    output_shape[2] = static_cast<index_t>(
        std::ceil(1.0 * (input_shape[2] + padding_size[1] - filter_shape[1] -
                         (filter_shape[1] - 1) * (dilations[1] - 1)) /
                  strides[1]) + 1);
  }

  output_shape[3] = filter_shape[2];
}

void CalPaddingSize(const index_t *input_shape,  // NCHW
...
mace/kernels/conv_pool_2d_util.h (View file @ 4410ecd2)

@@ -15,7 +15,7 @@ enum Padding {
  FULL = 2,  // Pads with one less than the filter size on both sides
};

enum RoundType {
  FLOOR = 0,
  CEIL = 1,
};
...
mace/kernels/depthwise_conv2d.h (View file @ 4410ecd2)

@@ -10,9 +10,9 @@
#endif

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/public/mace.h"

namespace mace {
namespace kernels {

@@ -247,7 +247,7 @@ struct DepthwiseConv2dFunctorBase {
        paddings_(paddings),
        dilations_(dilations),
        activation_(activation),
        relux_max_limit_(relux_max_limit) {}

  const int *strides_;  // [stride_h, stride_w]
  const Padding padding_type_;

@@ -296,8 +296,9 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
          padding_type_, output_shape.data(), paddings.data());
    } else {
      paddings = paddings_;
      CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                     paddings_.data(), dilations_, strides_, RoundType::FLOOR,
                     output_shape.data());
    }
    auto input_shape = fake_filter_shape;
    output->Resize(output_shape);
...
mace/kernels/eltwise.h (View file @ 4410ecd2)

@@ -5,13 +5,13 @@
#define MACE_KERNELS_ELTWISE_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"

namespace mace {
namespace kernels {

enum EltwiseType {
  PROD = 0,
  SUM = 1,
  MAX = 2,

@@ -19,8 +19,7 @@ enum EltwiseType {
};

struct EltwiseFunctorBase {
  EltwiseFunctorBase(const EltwiseType type,
                     const std::vector<float> &coeff)
      : type_(type), coeff_(coeff) {}

  EltwiseType type_;

@@ -29,8 +28,7 @@ struct EltwiseFunctorBase {
template <DeviceType D, typename T>
struct EltwiseFunctor : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff)
      : EltwiseFunctorBase(type, coeff) {}

  void operator()(const Tensor *input0,

@@ -49,7 +47,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
    switch (type_) {
      case PROD:
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output_ptr[i] = input0_ptr[i] * input1_ptr[i];
        }
        break;

@@ -62,19 +60,20 @@ struct EltwiseFunctor : EltwiseFunctorBase {
        } else {
#pragma omp parallel for
          for (index_t i = 0; i < size; ++i) {
            output_ptr[i] =
                coeff_[0] * input0_ptr[i] + coeff_[1] * input1_ptr[i];
          }
        }
        break;
      case MAX:
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output_ptr[i] = std::max<T>(input0_ptr[i], input1_ptr[i]);
        }
        break;
      case MIN:
#pragma omp parallel for
        for (index_t i = 0; i < size; ++i) {
          output_ptr[i] = std::min<T>(input0_ptr[i], input1_ptr[i]);
        }
        break;

@@ -84,11 +83,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
  }
};

template <typename T>
struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff)
      : EltwiseFunctorBase(type, coeff) {}

  void operator()(const Tensor *input0,
...
mace/kernels/fully_connected.h (View file @ 4410ecd2)

@@ -6,8 +6,8 @@
#define MACE_KERNELS_FULLY_CONNECTED_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"

namespace mace {

@@ -16,25 +16,23 @@ namespace kernels {
struct FullyConnectedBase {
  FullyConnectedBase(const ActivationType activation,
                     const float relux_max_limit)
      : activation_(activation),
        relux_max_limit_(relux_max_limit) {}

  const ActivationType activation_;
  const float relux_max_limit_;
};

template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase {
  FullyConnectedFunctor(const ActivationType activation,
                        const float relux_max_limit)
      : FullyConnectedBase(activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *weight,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future) {
    std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
    output->Resize(output_shape);
    const index_t N = output->dim(0);

@@ -70,11 +68,11 @@ struct FullyConnectedFunctor : FullyConnectedBase {
  }
};

template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
  FullyConnectedFunctor(const ActivationType activation,
                        const float relux_max_limit)
      : FullyConnectedBase(activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *weight,
...
mace/kernels/global_avg_pooling.h (View file @ 4410ecd2)

@@ -39,8 +39,10 @@ struct GlobalAvgPoolingFunctor {

template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
    const float *input,
    const index_t *input_shape,
    float *output,
    StatsFuture *future);

}  // namespace kernels
}  // namespace mace
...
mace/kernels/matmul.h (View file @ 4410ecd2)

@@ -6,20 +6,18 @@
#define MACE_KERNELS_MATMUL_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"

namespace mace {
namespace kernels {

template <DeviceType D, typename T>
struct MatMulFunctor {
  void operator()(const Tensor *A,
                  const Tensor *B,
                  Tensor *C,
                  StatsFuture *future) {
    std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
    C->Resize(c_shape);
    const index_t N = C->dim(0);

@@ -52,7 +50,6 @@ struct MatMulFunctor {
  }
};

template <typename T>
struct MatMulFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *A,
...
mace/kernels/neon/batch_norm_neon.cc (View file @ 4410ecd2)

@@ -52,7 +52,8 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
#pragma omp parallel for collapse(2)
  for (index_t i = 0; i < n; ++i) {
    for (index_t j = 0; j < sample_size; ++j) {
      const float *input_sample_ptr =
          input_ptr + (i * sample_size + j) * channel;
      float *output_sample_ptr = output_ptr + (i * sample_size + j) * channel;
      const float *new_scale_ptr = new_scale.data();
      const float *new_offset_ptr = new_offset.data();
...
mace/kernels/neon/conv_2d_neon.cc (View file @ 4410ecd2)

@@ -50,12 +50,11 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
  MACE_CHECK_NOTNULL(filter);
  MACE_CHECK_NOTNULL(output);

  std::vector<index_t> output_shape_vec(4);
  std::vector<int> paddings(2);
  kernels::CalcPaddingAndOutputSize(
      input->shape().data(), filter->shape().data(), dilations_, strides_,
      paddings_, output_shape_vec.data(), paddings.data());
  output->Resize(output_shape_vec);

  typedef void (*Conv2dNeonFunction)(

@@ -102,8 +101,8 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
  auto output_shape = output->shape().data();
  auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
  conv2d_neon_func(input_data, input_shape, filter_data, nullptr, bias_data,
                   output_data, output_shape);
}

}  // namespace kernels
...
mace/kernels/neon/conv_2d_neon_3x3.cc (View file @ 4410ecd2)

@@ -27,10 +27,8 @@ void Conv2dNeonK3x3S1(const float *input,  // NCHW
  int input_channels = input_shape[1];
  int input_height = input_shape[2];
  int input_width = input_shape[3];
  int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
  int filter_in_channels = filter_shape == nullptr ? input_channels : 1;

#pragma omp parallel for collapse(2)
  for (int b = 0; b < output_batch; ++b) {
    for (int oc = 0; oc < output_channels; ++oc) {

@@ -230,10 +228,8 @@ void Conv2dNeonK3x3S2(const float *input,  // NCHW
  int input_channels = input_shape[1];
  int input_height = input_shape[2];
  int input_width = input_shape[3];
  int multiplier = filter_shape == nullptr ? 0 : filter_shape[0];
  int filter_in_channels = filter_shape == nullptr ? input_channels : 1;

#pragma omp parallel for collapse(2)
  for (int b = 0; b < output_batch; ++b) {
...
mace/kernels/neon/depthwise_conv_neon.cc (View file @ 4410ecd2)

@@ -52,9 +52,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
        << "filter" << kernel_h << "x" << kernel_w << ","
        << " stride " << strides_[0] << "x" << strides_[1]
        << " is not implemented yet, using slow version";
    DepthwiseConv2dFunctor<DeviceType::CPU, float>(strides_, paddings_,
                                                   dilations_)(
        input, filter, bias, output, future);
    return;
  }

@@ -73,8 +72,8 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
    input_shape = padded_input.shape().data();
  }
  auto conv2d_neon_func = selector[kernel_h - 1][strides_[0] - 1];
  conv2d_neon_func(input_ptr, input_shape, filter_ptr, filter_shape, bias_ptr,
                   output_ptr, output_shape);
}

}  // namespace kernels
...
mace/kernels/opencl/activation_opencl.cc (View file @ 4410ecd2)

@@ -57,8 +57,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      default:
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    kernel_ =
        runtime->BuildKernel("activation", kernel_name, built_options);

    int idx = 0;
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {

@@ -74,8 +73,8 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                           static_cast<uint32_t>(height * batch)};
  const std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1),
             output->dim(2), output->dim(3));
  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
}
...
mace/kernels/opencl/addn.cc (View file @ 4410ecd2)

@@ -5,8 +5,8 @@
#include "mace/kernels/addn.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

@@ -57,31 +57,23 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    uint32_t idx = 0;
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
  }

  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                           static_cast<uint32_t>(batch_height_pixels)};
  const std::vector<uint32_t> lws = {64, 16, 1};
  std::stringstream ss;
  ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
     << "_" << output_shape[2] << "_" << output_shape[3];
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};

template struct AddNFunctor<DeviceType::OPENCL, float>;
template struct AddNFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/batch_norm_opencl.cc
View file @ 4410ecd2
...
@@ -60,17 +60,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
     kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);

     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(scale->opencl_image()));
     kernel_.setArg(idx++, *(offset->opencl_image()));
     if (!folded_constant_) {
       kernel_.setArg(idx++, *(mean->opencl_image()));
       kernel_.setArg(idx++, *(var->opencl_image()));
       kernel_.setArg(idx++, epsilon);
     }
...
mace/kernels/opencl/bias_add_opencl.cc
View file @ 4410ecd2
...
@@ -12,8 +12,7 @@ namespace mace {
 namespace kernels {

 template <typename T>
 void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                        const Tensor *bias,
                                                        Tensor *output,
                                                        StatsFuture *future) {
...
@@ -47,10 +46,8 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
       cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
   MACE_CHECK(error == CL_SUCCESS);
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
...
@@ -62,9 +59,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
   }
 }

 template struct BiasAddFunctor<DeviceType::OPENCL, float>;
 template struct BiasAddFunctor<DeviceType::OPENCL, half>;

 }  // namespace kernels
 }  // namespace mace
mace/kernels/opencl/buffer_to_image.cc
View file @ 4410ecd2
...
@@ -9,36 +9,33 @@
 namespace mace {
 namespace kernels {

 template <typename T>
 void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
   std::vector<size_t> image_shape;
   if (!i2b_) {
     CalImage2DShape(buffer->shape(), type, image_shape);
     if (type == WINOGRAD_FILTER) {
       std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
       image->ResizeImage(new_shape, image_shape);
     } else {
       image->ResizeImage(buffer->shape(), image_shape);
     }
   } else {
     Image *image_buf = dynamic_cast<Image *>(image->UnderlyingBuffer());
     image_shape = image_buf->image_shape();
     buffer->Resize(image->shape());
   }

   size_t gws[2] = {image_shape[0], image_shape[1]};
   std::string kernel_name;
   switch (type) {
     case CONV2D_FILTER:
       kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
       break;
     case DW_CONV2D_FILTER:
       kernel_name =
           i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
       break;
     case IN_OUT_CHANNEL:
       kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
...
@@ -48,7 +45,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
       break;
     case IN_OUT_HEIGHT:
     case WEIGHT_HEIGHT:
       kernel_name =
           i2b_ ? "in_out_height_image_to_buffer" : "in_out_height_buffer_to_image";
       break;
     case IN_OUT_WIDTH:
       MACE_CHECK(!i2b_) << "IN_OUT_WIDTH only support buffer to image now";
...
@@ -56,7 +54,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
       break;
     case WINOGRAD_FILTER:
       gws[1] /= 16;
       kernel_name =
           i2b_ ? "winograd_filter_image_to_buffer" : "winograd_filter_buffer_to_image";
       break;
   }
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
...
@@ -66,25 +65,30 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   built_options.emplace(kernel_name_ss.str());
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
   } else {
     built_options.emplace("-DDATA_TYPE=" +
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
   }
   auto runtime = OpenCLRuntime::Global();
   auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                          obfuscated_kernel_name,
                                          built_options);

   uint32_t idx = 0;
   b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
   if (!i2b_) {
     MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
                "buffer offset not aligned");
     b2f_kernel.setArg(idx++,
                       static_cast<uint32_t>(buffer->buffer_offset() /
                                             GetEnumTypeSize(buffer->dtype())));
   }
   if (type == ARGUMENT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
   } else if (type == WEIGHT_HEIGHT) {
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(0)));
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
     b2f_kernel.setArg(idx++, 1);
...
@@ -97,10 +101,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   const std::vector<uint32_t> lws = {16, 64};
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
       cl::NDRange(lws[0], lws[1]), nullptr, &event);
   MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
   if (future != nullptr) {
...
mace/kernels/opencl/cl/common.h
View file @ 4410ecd2
...
@@ -18,8 +18,8 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)

 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
 #ifdef USE_PRELU
...
mace/kernels/opencl/concat.cc
View file @ 4410ecd2
...
@@ -5,8 +5,8 @@
 #include "mace/kernels/concat.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/tuner.h"
 #include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {
...
@@ -42,24 +42,23 @@ static void Concat2(cl::Kernel *kernel,
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);

     uint32_t idx = 0;
     kernel->setArg(idx++,
                    *(static_cast<const cl::Image2D *>(input0->opencl_image())));
     kernel->setArg(idx++,
                    *(static_cast<const cl::Image2D *>(input1->opencl_image())));
     kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
     kernel->setArg(idx++,
                    *(static_cast<cl::Image2D *>(output->opencl_image())));
   }

   const uint32_t gws[3] = {
       static_cast<uint32_t>(channel_blk),
       static_cast<uint32_t>(width),
       static_cast<uint32_t>(batch * height),
   };
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::stringstream ss;
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
 }
...
@@ -97,22 +96,20 @@ static void ConcatN(cl::Kernel *kernel,
     index_t input_channel_blk = input->dim(3) / 4;
     chan_blk_offset += input_channel_blk;

     const uint32_t gws[3] = {
         static_cast<uint32_t>(input_channel_blk),
         static_cast<uint32_t>(width),
         static_cast<uint32_t>(batch * height),
     };
     const std::vector<uint32_t> lws = {8, 16, 8, 1};
     std::stringstream ss;
     ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width
        << "_" << batch * height;
     TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
   }
 }

 template <typename T>
 void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
     const std::vector<const Tensor *> &input_list,
     Tensor *output,
     StatsFuture *future) {
   const int inputs_count = input_list.size();
...
@@ -137,7 +134,8 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
     }
     output_shape[axis_] += input->dim(axis_);
   }
   MACE_CHECK(inputs_count == 2 || divisible_four,
              "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
   std::vector<size_t> image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
...
@@ -151,17 +149,14 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
     default:
       if (divisible_four) {
         ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
       } else {
         MACE_NOT_IMPLEMENTED;
       }
   }
 };

 template struct ConcatFunctor<DeviceType::OPENCL, float>;
 template struct ConcatFunctor<DeviceType::OPENCL, half>;

 }  // namespace kernels
 }  // namespace mace
mace/kernels/opencl/conv_2d_opencl.cc
View file @ 4410ecd2
...
@@ -47,21 +47,21 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          Tensor *output,
                          StatsFuture *future);

 template <typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                       const Tensor *filter,
                                                       const Tensor *bias,
                                                       Tensor *output,
                                                       StatsFuture *future) {
   typedef void (*Conv2dOpenclFunction)(
       cl::Kernel *kernel, const Tensor *input, const Tensor *filter,
       const Tensor *bias, const int stride, const int *padding,
       const int *dilations, const ActivationType activation,
       const float relux_max_limit, const DataType dt,
       Tensor *output, StatsFuture *future);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5] = {
       Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};

   index_t kernel_h = filter->dim(0);
   index_t kernel_w = filter->dim(1);
...
@@ -83,8 +83,9 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                  padding_type_, output_shape.data(), paddings.data());
   } else {
     paddings = paddings_;
     CalcOutputSize(input->shape().data(), filter->shape().data(),
                    paddings_.data(), dilations_, strides_, RoundType::FLOOR,
                    output_shape.data());
   }

   std::vector<size_t> output_image_shape;
...
@@ -94,18 +95,18 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   if (kernel_h == kernel_w && kernel_h <= 5 &&
       selector[kernel_h - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1];
     conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
                 DataTypeToEnum<T>::value, output, future);
   } else {
     Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
                  DataTypeToEnum<T>::value, output, future);
   }
 }

 template struct Conv2dFunctor<DeviceType::OPENCL, float>;
 template struct Conv2dFunctor<DeviceType::OPENCL, half>;

 }  // namespace kernels
 }  // namespace mace
mace/kernels/opencl/conv_2d_opencl_1x1.cc
View file @ 4410ecd2
...
@@ -66,20 +66,15 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   }

   auto runtime = OpenCLRuntime::Global();
   *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);

   uint32_t idx = 0;
   kernel->setArg(idx++, *(input->opencl_image()));
   kernel->setArg(idx++, *(filter->opencl_image()));
   if (bias != nullptr) {
     kernel->setArg(idx++, *(bias->opencl_image()));
   }
   kernel->setArg(idx++, *(output->opencl_image()));
   // FIXME handle flexable data type: half not supported
   kernel->setArg(idx++, relux_max_limit);
   kernel->setArg(idx++, static_cast<int>(input_height));
...
@@ -100,6 +95,5 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 }

 }  // namespace kernels
 }  // namespace mace
mace/kernels/opencl/conv_2d_opencl_3x3.cc
View file @ 4410ecd2
...
@@ -61,20 +61,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
   }

   auto runtime = OpenCLRuntime::Global();
   *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);

   uint32_t idx = 0;
   kernel->setArg(idx++, *(input->opencl_image()));
   kernel->setArg(idx++, *(filter->opencl_image()));
   if (bias != nullptr) {
     kernel->setArg(idx++, *(bias->opencl_image()));
   }
   kernel->setArg(idx++, *(output->opencl_image()));
   kernel->setArg(idx++, relux_max_limit);
   kernel->setArg(idx++, static_cast<int>(input->dim(1)));
   kernel->setArg(idx++, static_cast<int>(input->dim(2)));
...
mace/kernels/opencl/conv_2d_opencl_general.cc
View file @ 4410ecd2
...
@@ -61,20 +61,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
   }

   auto runtime = OpenCLRuntime::Global();
   *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);

   uint32_t idx = 0;
   kernel->setArg(idx++, *(input->opencl_image()));
   kernel->setArg(idx++, *(filter->opencl_image()));
   if (bias != nullptr) {
     kernel->setArg(idx++, *(bias->opencl_image()));
   }
   kernel->setArg(idx++, *(output->opencl_image()));
   kernel->setArg(idx++, relux_max_limit);
   kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
   kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
...
mace/kernels/opencl/depthwise_conv_opencl.cc
View file @ 4410ecd2
...
@@ -34,7 +34,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
   if (kernel->get() == nullptr) {
     const index_t input_batch = input->dim(0);
     const index_t input_height = input->dim(1);
     const index_t input_width = input->dim(2);
...
@@ -78,18 +78,16 @@ void DepthwiseConv2d(cl::Kernel *kernel,
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
     *kernel =
         runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);

     uint32_t idx = 0;
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
       kernel->setArg(idx++, *(bias->opencl_image()));
     }
     kernel->setArg(idx++, *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
     kernel->setArg(idx++, static_cast<short>(input_height));
     kernel->setArg(idx++, static_cast<short>(input_width));
...
@@ -154,16 +152,17 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
                  padding_type_, output_shape.data(), paddings.data());
   } else {
     paddings = paddings_;
     CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
                    paddings_.data(), dilations_, strides_, RoundType::FLOOR,
                    output_shape.data());
   }

   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);

   DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                   dilations_, activation_, relux_max_limit_,
                   DataTypeToEnum<T>::value, output, future);
 }
...
mace/kernels/opencl/eltwise_opencl.cc
View file @ 4410ecd2
...
@@ -15,7 +15,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
                                                        const Tensor *input1,
                                                        Tensor *output,
                                                        StatsFuture *future) {
   const index_t batch = input0->dim(0);
   const index_t height = input0->dim(1);
   const index_t width = input0->dim(2);
...
@@ -38,10 +37,8 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);

     uint32_t idx = 0;
     kernel_.setArg(idx++, *(input0->opencl_image()));
     kernel_.setArg(idx++, *(input1->opencl_image()));
     if (!coeff_.empty()) {
       kernel_.setArg(idx++, coeff_[0]);
       kernel_.setArg(idx++, coeff_[1]);
...
@@ -49,17 +46,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     kernel_.setArg(idx++, *(output->opencl_image()));
   }

   const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
                            static_cast<uint32_t>(batch_height_pixels)};
   const std::vector<uint32_t> lws = {64, 16, 1};
   std::stringstream ss;
   ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 }
...
mace/kernels/opencl/fully_connected_opencl.cc
View file @ 4410ecd2

@@ -10,14 +10,13 @@
namespace mace {
namespace kernels {

template <typename T>
void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *input,
    const Tensor *weight,
    const Tensor *bias,
    Tensor *output,
    StatsFuture *future) {
  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);

@@ -57,19 +56,16 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
      default:
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    kernel_ = runtime->BuildKernel("fully_connected", kernel_name, built_options);

    uint32_t idx = 0;
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
      kernel_.setArg(idx++, *(bias->opencl_image()));
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int>(input->dim(2)));
    kernel_.setArg(idx++, static_cast<int>(input->dim(3)));

@@ -78,25 +74,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
  }
  const uint32_t gws[2] = {
      static_cast<uint32_t>(batch),
      static_cast<uint32_t>(output_blocks),
  };
  const std::vector<uint32_t> lws = {16, 64, 1};
  std::stringstream ss;
  ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
     << output->dim(2) << "_" << output->dim(3);
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};

template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
template struct FullyConnectedFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/helper.cc
View file @ 4410ecd2

@@ -3,8 +3,8 @@
//
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

@@ -28,7 +28,8 @@ void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
}

// [H * W * M, (Ic + 3) / 4]
void CalDepthwiseConv2dFilterImageShape(
    const std::vector<index_t> &shape, /* HWIM */
    std::vector<size_t> &image_shape) {
  MACE_CHECK(shape.size() == 4);
  image_shape.resize(2);

@@ -47,7 +48,8 @@ void CalArgImageShape(const std::vector<index_t> &shape,
// Only support 3x3 now
// [ (Ic + 3) / 4, 16 * Oc]
void CalWinogradFilterImageShape(
    const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
    std::vector<size_t> &image_shape) {
  MACE_CHECK(shape.size() == 4);
  image_shape.resize(2);

@@ -115,15 +117,12 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
  }
}

std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                      const BufferType type) {
  if (type == WINOGRAD_FILTER) {
    return {16, shape[0], shape[1], 1};
  } else if (type == IN_OUT_HEIGHT) {
    index_t out_width =
        shape[0] * ((shape[1] - 1) / 2) * ((shape[2] - 1) / 2);
    return {16, shape[3], out_width, 1};
  } else {
    LOG(FATAL) << "Mace not supported yet.";

@@ -188,8 +187,8 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
    std::vector<uint32_t> local_ws(3, 0);
    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
    local_ws[2] =
        std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
    return {
        // TODO tuning these magic numbers
        {local_ws[0], local_ws[1], local_ws[2], 1},

@@ -217,20 +216,20 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
  };
  cl::Event event;
  auto func = [&](const std::vector<uint32_t> &params,
                  Timer *timer,
                  std::vector<uint32_t> *tuning_result) -> cl_int {
    MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D";
    cl_int error = CL_SUCCESS;
    if (timer == nullptr) {
      uint32_t num_blocks = params[3];
      const uint32_t block_size = gws[2] / num_blocks;
      if (gws[2] % num_blocks > 0) num_blocks++;
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws2 =
            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel,
            cl::NDRange(0, 0, i * block_size),
            cl::NDRange(gws[0], gws[1], gws2),
            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;

@@ -247,15 +246,16 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
      if (LimitKernelTime()) {
        double elapse_time = timer->AccumulatedMicros();
        timer->ClearTiming();
        uint32_t num_blocks = std::min(
            static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
        (*tuning_result)[3] = num_blocks;
        const uint32_t block_size = gws[2] / num_blocks;
        if (gws[2] % num_blocks > 0) num_blocks++;
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws2 =
              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel,
              cl::NDRange(0, 0, i * block_size),
              cl::NDRange(gws[0], gws[1], gws2),
              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
          MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;

@@ -300,34 +300,30 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
        {kwg_size / 256, 256, 1},
        {kwg_size / 512, 512, 1},
        {kwg_size, 1, 1},
        {1, kwg_size, 1}};
  };
  cl::Event event;
  auto func = [&](const std::vector<uint32_t> &params,
                  Timer *timer,
                  std::vector<uint32_t> *tuning_result) -> cl_int {
    MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d";
    cl_int error = CL_SUCCESS;
    if (timer == nullptr) {
      uint32_t num_blocks = params[2];
      const uint32_t block_size = gws[1] / num_blocks;
      if (gws[1] % num_blocks > 0) num_blocks++;
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws1 =
            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel,
            cl::NDRange(0, i * block_size),
            cl::NDRange(gws[0], gws1),
            cl::NDRange(params[0], params[1]), nullptr, &event);
        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      }
    } else {
      timer->ClearTiming();
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel, cl::NullRange,
          cl::NDRange(gws[0], gws[1]),
          cl::NDRange(params[0], params[1]), nullptr, &event);
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      timer->AccumulateTiming();

@@ -336,16 +332,16 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
      if (LimitKernelTime()) {
        double elapse_time = timer->AccumulatedMicros();
        timer->ClearTiming();
        uint32_t num_blocks = std::min(
            static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
        (*tuning_result)[2] = num_blocks;
        const uint32_t block_size = gws[1] / num_blocks;
        if (gws[1] % num_blocks > 0) num_blocks++;
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws1 =
              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel,
              cl::NDRange(0, i * block_size),
              cl::NDRange(gws[0], gws1),
              cl::NDRange(params[0], params[1]), nullptr, &event);
          MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
          timer->AccumulateTiming();

@@ -355,11 +351,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
    return error;
  };
  OpenCLProfilingTimer timer(&event);
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
      tuning_key, lws, params_generator, func, &timer);
  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();

@@ -368,7 +361,6 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
    }
  };
}

}  // namespace kernels
mace/kernels/opencl/helper.h
View file @ 4410ecd2

@@ -5,11 +5,11 @@
#ifndef MACE_KERNELS_OPENCL_HELPER_H_
#define MACE_KERNELS_OPENCL_HELPER_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/types.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

@@ -48,7 +48,6 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future);

void TuningOrRun2DKernel(cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,

@@ -72,12 +71,12 @@ inline bool LimitKernelTime() {
}

namespace {
template <typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
  (*ss) << v;
}

template <typename T, typename... Args>
void AppendToStream(std::stringstream *ss,
                    const std::string &delimiter,
                    T first,

@@ -87,7 +86,7 @@ void AppendToStream(std::stringstream *ss,
}
}  // namespace

template <typename... Args>
std::string Concat(Args... args) {
  std::stringstream ss;
  AppendToStream(&ss, "_", args...);
mace/kernels/opencl/matmul.cc
View file @ 4410ecd2

@@ -11,12 +11,10 @@ namespace mace {
namespace kernels {

template <typename T>
void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
                                                      const Tensor *B,
                                                      Tensor *C,
                                                      StatsFuture *future) {
  std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
  std::vector<size_t> c_image_shape;
  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);

@@ -41,8 +39,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
  uint32_t idx = 0;
  kernel_.setArg(idx++, *(A->opencl_image()));
  kernel_.setArg(idx++, *(B->opencl_image()));
  kernel_.setArg(idx++, *(C->opencl_image()));
  kernel_.setArg(idx++, static_cast<int>(height));
  kernel_.setArg(idx++, static_cast<int>(width));

@@ -57,20 +54,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
  };
  const std::vector<uint32_t> lws = {16, 64, 1};
  std::stringstream ss;
  ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
     << C->dim(2) << "_" << C->dim(3);
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
};

template struct MatMulFunctor<DeviceType::OPENCL, float>;
template struct MatMulFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/pooling_opencl.cc
View file @ 4410ecd2

@@ -11,17 +11,15 @@
namespace mace {
namespace kernels {

template <typename T>
void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                       Tensor *output,
                                                       StatsFuture *future) {
  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
      << "Pooling opencl kernel not support dilation yet";
  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {
      kernels_[0], kernels_[1], input->dim(3), input->dim(3)};

  std::vector<int> paddings(2);
  if (paddings_.empty()) {

@@ -77,24 +75,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
  const uint32_t gws[3] = {
      static_cast<uint32_t>(channel_blocks),
      static_cast<uint32_t>(out_width),
      static_cast<uint32_t>(batch * out_height),
  };
  std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::stringstream ss;
  ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}

template struct PoolingFunctor<DeviceType::OPENCL, float>;
template struct PoolingFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/resize_bilinear_opencl.cc
View file @ 4410ecd2

@@ -2,12 +2,12 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/kernels/resize_bilinear.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

@@ -29,14 +29,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  std::vector<index_t> output_shape{batch, out_height, out_width, channels};
  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
                  output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  float height_scale =
      CalculateResizeScale(in_height, out_height, align_corners_);
  float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);

  auto runtime = OpenCLRuntime::Global();
  std::set<std::string> built_options;

@@ -45,7 +45,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  auto dt = DataTypeToEnum<T>::value;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);

  uint32_t idx = 0;
  kernel_.setArg(idx++, *(input->opencl_image()));

@@ -62,11 +63,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
      static_cast<uint32_t>(out_height * batch)};
  const std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::stringstream ss;
  ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
     << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
mace/kernels/opencl/softmax_opencl.cc
View file @ 4410ecd2

@@ -6,13 +6,13 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

template <typename T>
void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                                                       Tensor *output,
                                                       StatsFuture *future) {

@@ -45,17 +45,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
      static_cast<uint32_t>(height * batch)};
  const std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::stringstream ss;
  ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}

template struct SoftmaxFunctor<DeviceType::OPENCL, float>;
template struct SoftmaxFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/space_to_batch_opencl.cc

@@ -5,17 +5,18 @@
#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_
#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_

#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/space_to_batch.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"

namespace mace {
namespace kernels {

template <typename T>
void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    Tensor *space_tensor,
    const std::vector<index_t> &output_shape,
    Tensor *batch_tensor,
    StatsFuture *future) {
@@ -37,8 +38,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
    built_options.emplace(kernel_name_ss.str());
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToCLCMDDt(DataTypeToEnum<T>::value));
    kernel_ =
        runtime->BuildKernel("space_to_batch", kernel_name, built_options);

  uint32_t idx = 0;
  if (b2s_) {
@@ -59,15 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
  }

  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
  const uint32_t gws[3] = {
      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
  const std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::stringstream ss;
  ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
     << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
     << batch_tensor->dim(3);
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
}
...
mace/kernels/opencl/winograd_transform.cc

@@ -11,21 +11,21 @@
namespace mace {
namespace kernels {

template <typename T>
void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {
  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
  std::vector<int> paddings(2);
  if (paddings_.empty()) {
    kernels::CalcNHWCPaddingAndOutputSize(
        input_tensor->shape().data(), filter_shape.data(), dilations_.data(),
        strides_.data(), padding_type_, output_shape.data(), paddings.data());
  } else {
    paddings = paddings_;
    CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
                   paddings_.data(), dilations_.data(), strides_.data(),
                   RoundType::FLOOR, output_shape.data());
  }

  const index_t round_h = (output_shape[1] + 1) / 2;
@@ -38,14 +38,16 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
  CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
  output_tensor->ResizeImage(output_shape, image_shape);

  std::string obfuscated_kernel_name =
      MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
  std::set<std::string> built_options;
  built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
  built_options.emplace("-DDATA_TYPE=" +
                        DtToUpstreamCLDt(DataTypeToEnum<T>::value));
  built_options.emplace("-DCMD_DATA_TYPE=" +
                        DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
  auto runtime = OpenCLRuntime::Global();
  kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                 built_options);

  uint32_t idx = 0;
@@ -60,34 +62,39 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
  }

  const uint32_t gws[2] = {
      static_cast<uint32_t>(out_width),
      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
  const std::vector<uint32_t> lws = {128, 8, 1};
  std::stringstream ss;
  ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
     << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
     << input_tensor->dim(3);
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}

template <typename T>
void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *input_tensor,
    const Tensor *bias,
    Tensor *output_tensor,
    StatsFuture *future) {
  std::vector<index_t> output_shape = {batch_, height_, width_,
                                       input_tensor->dim(1)};
  std::vector<size_t> image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
  output_tensor->ResizeImage(output_shape, image_shape);

  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
    std::set<std::string> built_options;
    built_options.emplace("-Dwinograd_inverse_transform_2x2=" +
                          obfuscated_kernel_name);
    built_options.emplace("-DDATA_TYPE=" +
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation_) {
      case NOOP:
@@ -112,18 +119,21 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
    }

    auto runtime = OpenCLRuntime::Global();
    kernel_ = runtime->BuildKernel("winograd_transform",
                                   obfuscated_kernel_name, built_options);

  const uint32_t round_h = (height_ + 1) / 2;
  const uint32_t round_w = (width_ + 1) / 2;
  uint32_t idx = 0;
  kernel_.setArg(idx++,
                 *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
  if (bias != nullptr) {
    kernel_.setArg(idx++,
                   *(static_cast<const cl::Image2D *>(bias->opencl_image())));
  }
  kernel_.setArg(idx++,
                 *(static_cast<cl::Image2D *>(output_tensor->opencl_image())));
  kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
  kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
  kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
@@ -131,28 +141,23 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
    kernel_.setArg(idx++, relux_max_limit_);
  }

  const uint32_t gws[2] = {
      static_cast<uint32_t>(input_tensor->dim(2)),
      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
  const std::vector<uint32_t> lws = {128, 8, 1};
  std::stringstream ss;
  ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
     << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
     << input_tensor->dim(3);
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
}

template struct WinogradTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradTransformFunctor<DeviceType::OPENCL, half>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, float>;
template struct WinogradInverseTransformFunctor<DeviceType::OPENCL, half>;
}  // namespace kernels
}  // namespace mace
mace/kernels/pooling.h

@@ -7,9 +7,9 @@
#include <limits>

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/conv_pool_2d_util.h"

namespace mace {
@@ -42,7 +42,7 @@ struct PoolingFunctorBase {
  const int *dilations_;
};

template <DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
  PoolingFunctor(const PoolingType pooling_type,
                 const int *kernels,
@@ -50,29 +50,27 @@ struct PoolingFunctor : PoolingFunctorBase {
                 const Padding padding_type,
                 const std::vector<int> &paddings,
                 const int *dilations)
      : PoolingFunctorBase(pooling_type, kernels, strides, padding_type,
                           paddings, dilations) {}

  void operator()(const Tensor *input_tensor,
                  Tensor *output_tensor,
                  StatsFuture *future) {
    std::vector<index_t> output_shape(4);
    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
                                         input_tensor->dim(3),
                                         input_tensor->dim(3)};

    std::vector<int> paddings(2);
    if (paddings_.empty()) {
      kernels::CalcNHWCPaddingAndOutputSize(
          input_tensor->shape().data(), filter_shape.data(), dilations_,
          strides_, padding_type_, output_shape.data(), paddings.data());
    } else {
      paddings = paddings_;
      CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
                     paddings_.data(), dilations_, strides_, RoundType::CEIL,
                     output_shape.data());
    }
    output_tensor->Resize(output_shape);
@@ -110,7 +108,8 @@ struct PoolingFunctor : PoolingFunctorBase {
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          for (int c = 0; c < channels; ++c) {
            index_t out_offset =
                (((b * height) + h) * width + w) * channels + c;
            index_t in_offset = b * in_image_size * input_channels + c;
            T res = std::numeric_limits<T>::lowest();
            for (int kh = 0; kh < kernel_h; ++kh) {
@@ -119,7 +118,8 @@ struct PoolingFunctor : PoolingFunctorBase {
                int inw = padded_w_start + w * stride_w + dilation_w * kw;
                if (inh >= 0 && inh < input_height && inw >= 0 &&
                    inw < input_width) {
                  index_t input_offset =
                      in_offset + (inh * input_width + inw) * input_channels;
                  res = std::max(res, input[input_offset]);
                }
              }
@@ -135,7 +135,8 @@ struct PoolingFunctor : PoolingFunctorBase {
      for (int h = 0; h < height; ++h) {
        for (int w = 0; w < width; ++w) {
          for (int c = 0; c < channels; ++c) {
            index_t out_offset =
                (((b * height) + h) * width + w) * channels + c;
            index_t in_offset = b * in_image_size * input_channels + c;
            T sum = 0;
            int block_size = 0;
@@ -145,7 +146,8 @@ struct PoolingFunctor : PoolingFunctorBase {
                int inw = padded_w_start + w * stride_w + dilation_w * kw;
                if (inh >= 0 && inh < input_height && inw >= 0 &&
                    inw < input_width) {
                  index_t input_offset =
                      in_offset + (inh * input_width + inw) * input_channels;
                  sum += input[input_offset];
                  block_size += 1;
                }
@@ -158,16 +160,13 @@ struct PoolingFunctor : PoolingFunctorBase {
          }
        }
      }
};

template <>
void PoolingFunctor<DeviceType::NEON, float>::operator()(
    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future);

template <typename T>
struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
  PoolingFunctor(const PoolingType pooling_type,
                 const int *kernels,
@@ -175,9 +174,9 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
                 const Padding padding_type,
                 const std::vector<int> &paddings,
                 const int *dilations)
      : PoolingFunctorBase(pooling_type, kernels, strides, padding_type,
                           paddings, dilations) {}

  void operator()(const Tensor *input_tensor,
                  Tensor *output_tensor,
                  StatsFuture *future);
...
mace/kernels/reshape.h

@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESHAPE_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"

namespace mace {
namespace kernels {
@@ -25,7 +25,6 @@ struct ReshapeFunctor {
  }
};

}  // namespace kernels
}  // namespace mace
...
mace/kernels/resize_bilinear.h

@@ -5,8 +5,8 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"

namespace mace {
namespace kernels {
@@ -163,8 +163,9 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
  }
};

template <typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
    : ResizeBilinearFunctorBase {
  ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
      : ResizeBilinearFunctorBase(size, align_corners) {}
...
mace/kernels/space_to_batch.h

@@ -6,9 +6,9 @@
#define MACE_KERNELS_CONV_2D_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"

namespace mace {
namespace kernels {
@@ -16,11 +16,10 @@ namespace kernels {
struct SpaceToBatchFunctorBase {
  SpaceToBatchFunctorBase(const std::vector<int> &paddings,
                          const std::vector<int> &block_shape,
                          bool b2s)
      : paddings_(paddings.begin(), paddings.end()),
        block_shape_(block_shape.begin(), block_shape.end()),
        b2s_(b2s) {}

  std::vector<int> paddings_;
  std::vector<int> block_shape_;
@@ -28,10 +27,11 @@ struct SpaceToBatchFunctorBase {
};

template <DeviceType D, typename T>
struct SpaceToBatchFunctor : SpaceToBatchFunctorBase {
  SpaceToBatchFunctor(const std::vector<int> &paddings,
                      const std::vector<int> &block_shape,
                      bool b2s)
      : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}

  void operator()(Tensor *space_tensor,
                  const std::vector<index_t> &output_shape,
@@ -42,10 +42,11 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase{
};

template <typename T>
struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
  SpaceToBatchFunctor(const std::vector<int> &paddings,
                      const std::vector<int> &block_shape,
                      bool b2s)
      : SpaceToBatchFunctorBase(paddings, block_shape, b2s) {}

  void operator()(Tensor *space_tensor,
                  const std::vector<index_t> &output_shape,
@@ -53,7 +54,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
                  StatsFuture *future);

  cl::Kernel kernel_;
};

}  // namespace kernels
...
mace/kernels/winograd_transform.h

@@ -6,10 +6,10 @@
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_

#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h"

namespace mace {
namespace kernels {
@@ -17,8 +17,10 @@ namespace kernels {
struct WinogradTransformFunctorBase {
  WinogradTransformFunctorBase(const Padding &padding_type,
                               const std::vector<int> &paddings)
      : strides_({1, 1}),
        dilations_({1, 1}),
        padding_type_(padding_type),
        paddings_(paddings) {}

  const std::vector<int> strides_;    // [stride_h, stride_w]
  const std::vector<int> dilations_;  // [dilation_h, dilation_w]
@@ -26,29 +28,25 @@ struct WinogradTransformFunctorBase {
  std::vector<int> paddings_;
};

template <DeviceType D, typename T>
struct WinogradTransformFunctor : WinogradTransformFunctorBase {
  WinogradTransformFunctor(const Padding &padding_type,
                           const std::vector<int> &paddings)
      : WinogradTransformFunctorBase(padding_type, paddings) {}

  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
    MACE_NOT_IMPLEMENTED;
  }
};

template <typename T>
struct WinogradTransformFunctor<DeviceType::OPENCL, T>
    : WinogradTransformFunctorBase {
  WinogradTransformFunctor(const Padding &padding_type,
                           const std::vector<int> &paddings)
      : WinogradTransformFunctorBase(padding_type, paddings) {}

  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;
};
@@ -72,14 +70,15 @@ struct WinogradInverseTransformFunctorBase {
  const float relux_max_limit_;
};

template <DeviceType D, typename T>
struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(const int batch,
                                  const int height,
                                  const int width,
                                  const ActivationType activation,
                                  const float relux_max_limit)
      : WinogradInverseTransformFunctorBase(batch, height, width, activation,
                                            relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *bias,
@@ -87,17 +86,18 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
                  StatsFuture *future) {
    MACE_NOT_IMPLEMENTED;
  }
};

template <typename T>
struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
    : WinogradInverseTransformFunctorBase {
  WinogradInverseTransformFunctor(const int batch,
                                  const int height,
                                  const int width,
                                  const ActivationType activation,
                                  const float relux_max_limit)
      : WinogradInverseTransformFunctorBase(batch, height, width, activation,
                                            relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *bias,
...
mace/ops/activation.h

@@ -22,7 +22,8 @@ class ActivationOp : public Operator<D, T> {
  bool Run(StatsFuture *future) override {
    const Tensor *input_tensor = this->Input(0);
    const Tensor *alpha_tensor =
        this->InputSize() >= 2 ? this->Input(1) : nullptr;
    Tensor *output_tensor = this->outputs_[0];
    output_tensor->ResizeLike(input_tensor);
...
mace/ops/activation_test.cc View file @ 4410ecd2
...
@@ -214,9 +214,7 @@ void TestSimplePrelu() {
  net.AddInputFromArray<D, float>(
      "Input", {2, 2, 2, 2},
      {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
  net.AddInputFromArray<D, float>("Alpha", {2}, {2.0, 3.0});

  if (D == DeviceType::OPENCL) {
    BufferToImage<D, float>(net, "Input", "InputImage",
...
@@ -250,7 +248,8 @@ void TestSimplePrelu() {
  }

  auto expected = CreateTensor<float>(
      {2, 2, 2, 2},
      {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
...
mace/ops/addn.h View file @ 4410ecd2
...
@@ -26,12 +26,10 @@ class AddNOp : public Operator<D, T> {
    for (int i = 1; i < n; ++i) {
      inputs[i] = this->Input(i);
      MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size());
      MACE_CHECK(inputs[0]->size() == inputs[i]->size())
          << "Input 0: " << MakeString(inputs[0]->shape())
          << ", size: " << inputs[0]->size() << ". Input " << i << ": "
          << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size();
    }

    functor_(inputs, output_tensor, future);
...
mace/ops/addn_benchmark.cc View file @ 4410ecd2
...
@@ -15,8 +15,7 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
  OpsTestNet net;
  // Add input data
  for (int i = 0; i < inputs; ++i) {
    net.AddRandomInput<D, float>(MakeString("Input", i).c_str(), {n, h, w, c});
  }
  if (D == DeviceType::OPENCL) {
...
mace/ops/batch_norm_benchmark.cc View file @ 4410ecd2
mace/ops/batch_to_space.h View file @ 4410ecd2
...
@@ -12,13 +12,12 @@
namespace mace {

template <DeviceType D, typename T>
class BatchToSpaceNDOp : public Operator<D, T> {
 public:
  BatchToSpaceNDOp(const OperatorDef &op_def, Workspace *ws)
      : Operator<D, T>(op_def, ws),
        functor_(
            OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0}),
            OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1}),
            true) {}
...
@@ -28,7 +27,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
    std::vector<index_t> output_shape(4, 0);
    CalculateOutputShape(batch_tensor, space_tensor, output_shape.data());
    functor_(space_tensor, output_shape, const_cast<Tensor *>(batch_tensor),
             future);
    return true;
  }
...
@@ -37,7 +37,8 @@ class BatchToSpaceNDOp : public Operator<D, T> {
                            Tensor *output,
                            index_t *output_shape) {
    auto crops = OperatorBase::GetRepeatedArgument<int>("crops", {0, 0, 0, 0});
    auto block_shape =
        OperatorBase::GetRepeatedArgument<int>("block_shape", {1, 1});
    MACE_CHECK(input_tensor->dim_size() == 4, "Input's shape should be 4D");
    MACE_CHECK(block_shape.size() == 2, "Block's shape should be 1D");
    MACE_CHECK(crops.size() == 4, "Crops' shape should be 2D");
...
@@ -45,13 +46,13 @@ class BatchToSpaceNDOp : public Operator<D, T> {
    const index_t block_dims = block_shape.size();
    index_t block_shape_product = 1;
    for (uint32_t block_dim = 0; block_dim < block_dims; ++block_dim) {
      MACE_CHECK(block_shape[block_dim] > 1,
                 "block_shape's value should be great to 1");
      const index_t block_shape_value = block_shape[block_dim];
      const index_t cropped_input_size =
          input_tensor->dim(block_dim + 1) * block_shape_value -
          crops[block_dim * 2] - crops[block_dim * 2 + 1];
      MACE_CHECK(cropped_input_size >= 0,
                 "cropped size must be non-negative");
      block_shape_product *= block_shape_value;
      output_shape[block_dim + 1] = cropped_input_size;
    }
...
mace/ops/batch_to_space_benchmark.cc View file @ 4410ecd2
mace/ops/bias_add_benchmark.cc View file @ 4410ecd2
mace/ops/buffer_to_image.h View file @ 4410ecd2
...
@@ -11,7 +11,7 @@
namespace mace {

template <DeviceType D, typename T>
class BufferToImageOp : public Operator<D, T> {
 public:
  BufferToImageOp(const OperatorDef &op_def, Workspace *ws)
      : Operator<D, T>(op_def, ws) {}
...
@@ -19,7 +19,8 @@ class BufferToImageOp: public Operator<D, T> {
  bool Run(StatsFuture *future) override {
    const Tensor *input_tensor = this->Input(INPUT);
    kernels::BufferType type =
        static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
    Tensor *output = this->Output(OUTPUT);
...
mace/ops/buffer_to_image_test.cc View file @ 4410ecd2
...
@@ -7,8 +7,9 @@
using namespace mace;

template <DeviceType D, typename T>
void TestBidirectionTransform(const int type,
                              const std::vector<index_t> &input_shape) {
  OpsTestNet net;
  OpDefBuilder("BufferToImage", "BufferToImageTest")
      .Input("Input")
...
@@ -34,7 +35,8 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
  net.RunOp(D);

  // Check
  ExpectTensorNear<T>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                      1e-5);
}

TEST(BufferToImageTest, ArgSmall) {
...
@@ -54,51 +56,63 @@ TEST(BufferToImageTest, ArgLarge) {
}

TEST(BufferToImageTest, InputSmallSingleChannel) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
                                                      {1, 2, 3, 1});
}

TEST(BufferToImageTest, InputSmallMultipleChannel) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
                                                      {1, 2, 3, 3});
}

TEST(BufferToImageTest, InputSmallMultipleBatchAndChannel) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
                                                      {3, 2, 3, 3});
}

TEST(BufferToImageTest, InputMedia) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
                                                      {3, 13, 17, 128});
}

TEST(BufferToImageTest, InputLarge) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::IN_OUT_CHANNEL,
                                                      {3, 64, 64, 256});
}

TEST(BufferToImageTest, Filter1x1Small) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
                                                      {1, 1, 3, 5});
}

TEST(BufferToImageTest, Filter1x1Media) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
                                                      {1, 1, 13, 17});
}

TEST(BufferToImageTest, Filter1x1Large) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
                                                      {1, 1, 128, 512});
}

TEST(BufferToImageTest, Filter3x3Small) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
                                                      {3, 3, 3, 5});
}

TEST(BufferToImageTest, Filter3x3Meida) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
                                                      {3, 3, 13, 17});
}

TEST(BufferToImageTest, Filter3x3Large) {
  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER,
                                                      {3, 3, 128, 256});
}

template <DeviceType D, typename T>
void TestDiffTypeBidirectionTransform(const int type,
                                      const std::vector<index_t> &input_shape) {
  OpsTestNet net;
  OpDefBuilder("BufferToImage", "BufferToImageTest")
      .Input("Input")
...
@@ -123,14 +137,16 @@ void TestDiffTypeBidirectionTransform(const int type, const std::vector<index_t>
  net.RunOp(D);

  // Check
  ExpectTensorNear<float>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                          1e-2);
}

TEST(BufferToImageTest, ArgFloatToHalfSmall) {
  TestDiffTypeBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT,
                                                             {11});
}

template <DeviceType D, typename T>
void TestStringHalfBidirectionTransform(const int type,
                                        const std::vector<index_t> &input_shape,
                                        const unsigned char *input_data) {
...
@@ -142,9 +158,10 @@ void TestStringHalfBidirectionTransform(const int type,
      .AddIntArg("T", DataTypeToEnum<T>::value)
      .Finalize(net.NewOperatorDef());
  const half *h_data = reinterpret_cast<const half *>(input_data);
  net.AddInputFromArray<D, half>("Input", input_shape,
                                 std::vector<half>(h_data, h_data + 2));

  // Run
  net.RunOp(D);
...
@@ -160,12 +177,14 @@ void TestStringHalfBidirectionTransform(const int type,
  net.RunOp(D);

  // Check
  ExpectTensorNear<half>(*net.GetOutput("Input"), *net.GetOutput("I2BOutput"),
                         1e-2);
}

TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
  const unsigned char input_data[] = {
      0xCD, 0x3C, 0x33, 0x40,
  };
  TestStringHalfBidirectionTransform<DeviceType::OPENCL, half>(
      kernels::ARGUMENT, {2}, input_data);
}
mace/ops/channel_shuffle.h View file @ 4410ecd2
...
@@ -28,8 +28,8 @@ class ChannelShuffleOp : public Operator<D, T> {
               input->shape()[1]);
    output->ResizeLike(input);

    functor_(input->data<T>(), input->shape().data(), output->mutable_data<T>(),
             future);
    return true;
  }
...
mace/ops/channel_shuffle_benchmark.cc View file @ 4410ecd2
mace/ops/concat.h View file @ 4410ecd2
...
@@ -14,10 +14,11 @@ class ConcatOp : public Operator<D, T> {
 public:
  ConcatOp(const OperatorDef &op_def, Workspace *ws)
      : Operator<D, T>(op_def, ws),
        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}

  bool Run(StatsFuture *future) override {
    MACE_CHECK(this->InputSize() >= 2)
        << "There must be at least two inputs to concat";
    const std::vector<const Tensor *> input_list = this->Inputs();
    const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
    const int32_t input_dims = input_list[0]->dim_size();
...
mace/ops/concat_benchmark.cc View file @ 4410ecd2
...
@@ -38,8 +38,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
}

#define BM_CONCAT_CPU_MACRO(DIM0, DIM1)                      \
  static void BM_CONCAT_CPU_##DIM0##_##DIM1(int iters) {     \
    ConcatHelper<DeviceType::CPU, float>(iters, DIM0, DIM1); \
  }                                                          \
  BENCHMARK(BM_CONCAT_CPU_##DIM0##_##DIM1)
...
@@ -90,10 +89,8 @@ static void OpenclConcatHelper(int iters,
  }
}

#define BM_CONCAT_OPENCL_MACRO(N, H, W, C, TYPE)                           \
  static void BM_CONCAT_OPENCL_##N##_##H##_##W##_##C##_##TYPE(int iters) { \
    std::vector<index_t> shape = {N, H, W, C};                             \
    OpenclConcatHelper<TYPE>(iters, shape, shape, 3);                      \
  }                                                                        \
...
mace/ops/concat_test.cc View file @ 4410ecd2
...
@@ -112,8 +112,8 @@ TEST_F(ConcatOpTest, CPURandom) {
    concat_axis_size += input_shapes[i][axis];
    GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
    input_ptrs[i] = inputs[i].data();
    net.AddInputFromArray<DeviceType::CPU, float>(MakeString("Input", i),
                                                  input_shapes[i], inputs[i]);
  }

  // Run
...
@@ -214,6 +214,6 @@ TEST_F(ConcatOpTest, OPENCLUnAligned) {
}

TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) {
  OpenclRandomTest<float>(
      {{3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}, {3, 32, 32, 32}},
      3);
}
\ No newline at end of file
mace/ops/conv_2d_test.cc
浏览文件 @
4410ecd2
...
@@ -2,8 +2,8 @@
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
// Copyright (c) 2017 XiaoMi All rights reserved.
//
//
#include <fstream>
#include "mace/ops/conv_2d.h"
#include "mace/ops/conv_2d.h"
#include <fstream>
#include "mace/ops/ops_test_util.h"
#include "mace/ops/ops_test_util.h"
using
namespace
mace
;
using
namespace
mace
;
...
@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
...
@@ -342,7 +342,8 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F
(
Conv2dOpTest
,
OPENCLConv1x1
)
{
TestConv1x1
<
DeviceType
::
OPENCL
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLConv1x1
)
{
TestConv1x1
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
static
void
TestComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
stride
)
{
static
void
TestComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
stride
)
{
testing
::
internal
::
LogToStderr
();
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
Padding
type
)
{
...
@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
...
@@ -412,27 +413,21 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedConvNxNS12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedConvNxNS12
)
{
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
1
);
1
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
2
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
16
,
16
,
32
},
2
);
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS12
)
{
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
1
);
1
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
2
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
17
,
113
,
5
,
7
},
2
);
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS34
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS34
)
{
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
31
,
113
,
13
,
17
},
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
31
,
113
,
13
,
17
},
3
);
3
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
13
,
17
},
4
);
TestComplexConvNxNS12
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
13
,
17
},
4
);
}
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
static
void
TestHalfComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
input_shape
,
static
void
TestHalfComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
dilations
)
{
const
std
::
vector
<
int
>
&
dilations
)
{
...
@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
...
@@ -519,67 +514,58 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x1S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
1
,
32
,
64
},
{
1
,
1
,
32
,
64
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv3x3S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
3
,
3
,
32
,
64
},
{
3
,
3
,
32
,
64
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv15x1S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv15x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
15
,
1
,
256
,
2
},
{
15
,
1
,
256
,
2
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x15S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x15S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
15
,
256
,
2
},
{
1
,
15
,
256
,
2
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv7x75S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv7x75S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
7
,
7
,
3
,
64
},
{
7
,
7
,
3
,
64
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv1x1S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
1
,
1
,
5
,
7
},
{
1
,
1
,
5
,
7
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv3x3S12
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
3
,
3
,
5
,
7
},
{
3
,
3
,
5
,
7
},
{
1
,
1
});
{
1
,
1
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv5x5Dilation2
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv5x5Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
5
,
5
,
16
,
16
},
{
5
,
5
,
16
,
16
},
{
2
,
2
});
{
2
,
2
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation2
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
7
,
7
,
16
,
16
},
{
7
,
7
,
16
,
16
},
{
2
,
2
});
{
2
,
2
});
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation4
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation4
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
63
,
67
},
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
63
,
67
},
{
7
,
7
,
16
,
16
},
{
7
,
7
,
16
,
16
},
{
4
,
4
});
{
4
,
4
});
}
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
static
void
TestDilationConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation_rate
)
{
static
void
TestDilationConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation_rate
)
{
testing
::
internal
::
LogToStderr
();
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
Padding
type
)
{
...
@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
...
@@ -617,9 +603,12 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
// run on gpu
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
CONV2D_FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"InputImage"
)
...
@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
...
@@ -634,7 +623,8 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
// Run on device
// Run on device
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
};
...
@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
...
@@ -647,22 +637,20 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedDilation2
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedDilation2
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
2
);
2
);
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLAligned2Dilation4
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLAligned2Dilation4
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
4
);
4
);
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedDilation4
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedDilation4
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
4
);
4
);
}
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
static
void
TestArbitraryPadConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
int
>
&
paddings
)
{
static
void
TestArbitraryPadConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
int
>
&
paddings
)
{
testing
::
internal
::
LogToStderr
();
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
)
{
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
)
{
srand
(
time
(
NULL
));
srand
(
time
(
NULL
));
...
@@ -698,9 +686,12 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
    expected.Copy(*net.GetOutput("Output"));

    // run on gpu
    BufferToImage<D, T>(net, "Input", "InputImage",
                        kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, T>(net, "Filter", "FilterImage",
                        kernels::BufferType::CONV2D_FILTER);
    BufferToImage<D, T>(net, "Bias", "BiasImage",
                        kernels::BufferType::ARGUMENT);

    OpDefBuilder("Conv2D", "Conv2dTest")
        .Input("InputImage")
...
@@ -714,7 +705,8 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
    // Run on device
    net.RunOp(D);
    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput",
                        kernels::BufferType::IN_OUT_CHANNEL);

    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
  };
...
@@ -726,8 +718,7 @@ static void TestArbitraryPadConvNxN(const std::vector<index_t> &shape, const std
}

TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({32, 32, 32, 64}, {1, 1});
}

TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
...
@@ -736,6 +727,5 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
}

TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
  TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
}
mace/ops/eltwise.h (view file @ 4410ecd2)
...
@@ -18,15 +18,17 @@ class EltwiseOp : public Operator<D, T> {
        functor_(static_cast<kernels::EltwiseType>(
                     OperatorBase::GetSingleArgument<int>(
                         "type", static_cast<int>(kernels::EltwiseType::SUM))),
                 OperatorBase::GetRepeatedArgument<float>("coeff")) {}

  bool Run(StatsFuture *future) override {
    const Tensor *input0 = this->Input(0);
    const Tensor *input1 = this->Input(1);
    Tensor *output = this->Output(OUTPUT);
    MACE_CHECK(input0->dim_size() == input1->dim_size())
        << "Inputs of Eltwise op must be same shape";
    for (int i = 0; i < input0->dim_size(); ++i) {
      MACE_CHECK(input0->dim(i) == input1->dim(i))
          << "Inputs of Eltwise op must be same shape";
    }
    output->ResizeLike(input0);
...
mace/ops/eltwise_benchmark.cc (view file @ 4410ecd2)

mace/ops/eltwise_test.cc (view file @ 4410ecd2)
...
@@ -2,15 +2,15 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/kernels/eltwise.h"

namespace mace {

class EltwiseOpTest : public OpsTestBase {};
template <DeviceType D>
void Simple(const kernels::EltwiseType type,
            const std::vector<index_t> &shape,
            const std::vector<float> &input0,
...
@@ -36,8 +36,10 @@ void Simple(const kernels::EltwiseType type,
    // Run
    net.RunOp(D);
  } else {
    BufferToImage<D, half>(net, "Input1", "InputImg1",
                           kernels::BufferType::IN_OUT_CHANNEL);
    BufferToImage<D, half>(net, "Input2", "InputImg2",
                           kernels::BufferType::IN_OUT_CHANNEL);

    OpDefBuilder("Eltwise", "EltwiseTest")
        .Input("InputImg1")
        .Input("InputImg2")
...
@@ -49,7 +51,8 @@ void Simple(const kernels::EltwiseType type,
    // Run
    net.RunOp(D);
    ImageToBuffer<D, float>(net, "OutputImg", "Output",
                            kernels::BufferType::IN_OUT_CHANNEL);
  }

  auto expected = CreateTensor<float>(shape, output);
...
@@ -58,64 +61,42 @@ void Simple(const kernels::EltwiseType type,
}
TEST_F(EltwiseOpTest, CPUSimple) {
  Simple<DeviceType::CPU>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
                          {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                          {1, 4, 9, 16, 25, 36});
  Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
                          {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                          {2, 4, 6, 8, 10, 12});
  Simple<DeviceType::CPU>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
                          {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                          {3, 6, 9, 12, 15, 18}, {2, 1});
  Simple<DeviceType::CPU>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
                          {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                          {1, 2, 3, 4, 6, 6});
  Simple<DeviceType::CPU>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
                          {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                          {1, 1, 3, 3, 5, 6});
}
TEST_F(EltwiseOpTest, GPUSimple) {
  Simple<DeviceType::OPENCL>(kernels::EltwiseType::PROD, {1, 1, 2, 3},
                             {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                             {1, 4, 9, 16, 25, 36});
  Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
                             {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                             {2, 4, 6, 8, 10, 12});
  Simple<DeviceType::OPENCL>(kernels::EltwiseType::SUM, {1, 1, 2, 3},
                             {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6},
                             {3, 6, 9, 12, 15, 18}, {2, 1});
  Simple<DeviceType::OPENCL>(kernels::EltwiseType::MAX, {1, 1, 2, 3},
                             {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                             {1, 2, 3, 4, 6, 6});
  Simple<DeviceType::OPENCL>(kernels::EltwiseType::MIN, {1, 1, 2, 3},
                             {1, 2, 3, 4, 5, 6}, {1, 1, 3, 3, 6, 6},
                             {1, 1, 3, 3, 5, 6});
}
template <DeviceType D, typename T>
void RandomTest(const kernels::EltwiseType type,
                const std::vector<index_t> &shape) {
  testing::internal::LogToStderr();
...
@@ -139,8 +120,10 @@ void RandomTest(const kernels::EltwiseType type,
  // Run
  net.RunOp();

  BufferToImage<D, T>(net, "Input1", "InputImg1",
                      kernels::BufferType::IN_OUT_CHANNEL);
  BufferToImage<D, T>(net, "Input2", "InputImg2",
                      kernels::BufferType::IN_OUT_CHANNEL);

  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("InputImg1")
      .Input("InputImg2")
...
@@ -153,12 +136,15 @@ void RandomTest(const kernels::EltwiseType type,
  // Run
  net.RunOp(D);
  ImageToBuffer<D, float>(net, "OutputImg", "OPENCLOutput",
                          kernels::BufferType::IN_OUT_CHANNEL);

  if (DataTypeToEnum<T>::value == DT_FLOAT) {
    ExpectTensorNear<float>(*net.GetTensor("Output"),
                            *net.GetOutput("OPENCLOutput"), 1e-3);
  } else {
    ExpectTensorNear<float>(*net.GetTensor("Output"),
                            *net.GetOutput("OPENCLOutput"), 1e-1);
  }
}
...
mace/ops/folded_batch_norm.cc (view file @ 4410ecd2)
...
@@ -7,22 +7,19 @@
namespace mace {

void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
  REGISTER_OPERATOR(op_registry,
                    OpKeyBuilder("FoldedBatchNorm")
                        .Device(DeviceType::CPU)
                        .TypeConstraint<float>("T")
                        .Build(),
                    FoldedBatchNormOp<DeviceType::CPU, float>);
  REGISTER_OPERATOR(op_registry,
                    OpKeyBuilder("FoldedBatchNorm")
                        .Device(DeviceType::OPENCL)
                        .TypeConstraint<float>("T")
                        .Build(),
                    FoldedBatchNormOp<DeviceType::OPENCL, float>);
  REGISTER_OPERATOR(op_registry,
                    OpKeyBuilder("FoldedBatchNorm")
                        .Device(DeviceType::OPENCL)
                        .TypeConstraint<half>("T")
                        .Build(),
...
mace/ops/folded_batch_norm_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/fully_connected.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/fully_connected_benchmark.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/fully_connected_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/fused_conv_2d_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/global_avg_pooling.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/global_avg_pooling_benchmark.cc (view file @ 4410ecd2)
mace/ops/image_to_buffer.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/matmul.h (view file @ 4410ecd2)
mace/ops/matmul_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/ops_test_util.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/pooling.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/pooling_benchmark.cc (view file @ 4410ecd2)
mace/ops/pooling_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/reshape.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/reshape_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/softmax.cc (view file @ 4410ecd2)
mace/ops/softmax.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/softmax_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/space_to_batch.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/space_to_batch_benchmark.cc (view file @ 4410ecd2)
mace/ops/winograd_convolution_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/winograd_inverse_transform.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/winograd_transform.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/ops/winograd_transform_benchmark.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/public/mace.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/utils/command_line_flags.h (view file @ 4410ecd2)
mace/utils/env_time.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/utils/logging.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/utils/string_util.h (view file @ 4410ecd2)
mace/utils/timer.h (view file @ 4410ecd2; diff collapsed, click to expand)
mace/utils/tuner_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)
mace/utils/utils.h (view file @ 4410ecd2)
mace/utils/utils_test.cc (view file @ 4410ecd2; diff collapsed, click to expand)