diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 1e3601a4a5f2f38dbf0bfa6d5acc8dbd21c2fa4d..55368c3ca83c8aa7dd9e8d76efb47bde568ec4ce 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_ACTIVATION_H_
 #define MACE_KERNELS_ACTIVATION_H_
 
+#include <algorithm>
+#include <string>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 3a5a45df4dd3476e4ab7a2f58bae658b461e206a..70d9583ba798babd3a27737c9ed7487913441bf6 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -8,6 +8,7 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <algorithm>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -17,9 +18,7 @@
 namespace mace {
 namespace kernels {
 
-namespace {
 constexpr int kCostPerGroup = 1024;
-}  // namespace
 
 template <DeviceType D, typename T>
 struct AddNFunctor {
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index 57f0f4d66a0107ef6e907e4c5579bed0feef2be3..28b8d776c967e48a4af835ee55913c437aa3d3ea 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -8,6 +8,7 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -159,7 +160,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
   std::vector<index_t> input_shape_;
 };
 
-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace
 
 #endif  // MACE_KERNELS_BATCH_NORM_H_
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index d8e411efa8a70ebd2ca850f6ac91fa1bd2198fe6..d5372850bcf604b0f1e01e630c0c30b59e95abc0 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_BIAS_ADD_H_
 #define MACE_KERNELS_BIAS_ADD_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -65,7 +67,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
   std::vector<index_t> input_shape_;
 };
 
-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace
 
 #endif  // MACE_KERNELS_BIAS_ADD_H_
diff --git a/mace/kernels/buffer_to_image.h b/mace/kernels/buffer_to_image.h
index 3292e993ab107dad1cb0ce5a66632d21370c7302..2956762d5d70fb089e8e2bee34f114693fb1cc12 100644
--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -13,13 +13,14 @@ namespace mace {
 namespace kernels {
 
 struct BufferToImageFunctorBase {
-  BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
+  explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
   bool i2b_;
 };
 
 template <DeviceType D, typename T>
 struct BufferToImageFunctor : BufferToImageFunctorBase {
-  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
+  explicit BufferToImageFunctor(bool i2b = false)
+      : BufferToImageFunctorBase(i2b) {}
   void operator()(Tensor *input,
                   const BufferType type,
                   Tensor *output,
@@ -30,14 +31,15 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {
 
 template <typename T>
 struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
-  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
+  explicit BufferToImageFunctor(bool i2b = false)
+      : BufferToImageFunctorBase(i2b) {}
   void operator()(Tensor *input,
                   const BufferType type,
                   Tensor *output,
                   StatsFuture *future);
 };
 
-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace
 
 #endif  // MACE_KERNELS_BUFFER_TO_IMAGE_H_
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index da2ce094a141984c49ad21a208bdaafb8a97311e..f1e258337a2d9a871bbb3ac4aec70faf1a18edf9 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
 #define MACE_KERNELS_CHANNEL_SHUFFLE_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 
@@ -13,7 +15,7 @@ namespace kernels {
 
 template <DeviceType D, typename T>
 struct ChannelShuffleFunctor {
-  ChannelShuffleFunctor(const int groups) : groups_(groups) {}
+  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
 
   void operator()(const Tensor *input,
                   Tensor *output,
@@ -49,7 +51,7 @@ struct ChannelShuffleFunctor {
 
 template <typename T>
 struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
-  ChannelShuffleFunctor(const int groups) : groups_(groups) {}
+  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
 
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
 
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 6870594641baaab5aae866c033107a7b6df0507c..de34ed69fa5803f61e9f6785b9d4b7185be2cccc 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_CONCAT_H_
 #define MACE_KERNELS_CONCAT_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -15,14 +17,14 @@ namespace mace {
 namespace kernels {
 
 struct ConcatFunctorBase {
-  ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
+  explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
 
   int32_t axis_;
 };
 
 template <DeviceType D, typename T>
 struct ConcatFunctor : ConcatFunctorBase {
-  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
+  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
 
   void operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output,
@@ -77,7 +79,7 @@ struct ConcatFunctor : ConcatFunctorBase {
 
 template <typename T>
 struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
-  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
+  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
 
   void operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output,
@@ -86,7 +88,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
   std::vector<index_t> input_shape_;
 };
 
-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace
 
 #endif  // MACE_KERNELS_CONCAT_H_
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index b107d33229c0b77be24e0702db9cf0585801b06f..47516291d14ec21ba2202e2089bee03d6387c433 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -8,6 +8,8 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <algorithm>
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -18,7 +20,6 @@
 
 namespace mace {
 namespace kernels {
-namespace {
 
 template <typename T,
           int inc_tile_size,
@@ -61,9 +62,9 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
         // AArch64 NEON has 32 128-bit general purpose registers
         static_assert(inc_tile_size == 4, "input channels tile size must be 4");
-        float32x4_t in[h_count * w_count];
+        float32x4_t in[h_count * w_count];  // NOLINT(runtime/arrays)
 #else
-        T in[h_count * w_count * inc_tile_size];
+        T in[h_count * w_count * inc_tile_size];  // NOLINT(runtime/arrays)
 #endif
         for (int hi = 0; hi < h_count; ++hi) {
           for (int wi = 0; wi < w_count; ++wi) {
@@ -86,9 +87,9 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
 
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
         static_assert(inc_tile_size == 4, "input channels tile size must be 4");
-        float32x4_t weights[c_count];
+        float32x4_t weights[c_count];  // NOLINT(runtime/arrays)
 #else
-        T weights[c_count * inc_tile_size];
+        T weights[c_count * inc_tile_size];  // NOLINT(runtime/arrays)
 #endif
         for (int ci = 0; ci < c_count; ++ci) {
           const int weights_idx = ci;
@@ -126,7 +127,7 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
       }
       // handling the remaining input channels
       for (; inc < input_channels; ++inc) {
-        T in[h_count * w_count];
+        T in[h_count * w_count];  // NOLINT(runtime/arrays)
         for (int hi = 0; hi < h_count; ++hi) {
           for (int wi = 0; wi < w_count; ++wi) {
             const int in_idx = hi * w_count + wi;
@@ -138,7 +139,7 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
           }
         }
 
-        T weights[c_count];
+        T weights[c_count];  // NOLINT(runtime/arrays)
         for (int ci = 0; ci < c_count; ++ci) {
           const int weights_idx = ci;
           const int filter_offset =
@@ -173,7 +174,6 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
     }
   }
 }
-};  // namespace
 
 struct Conv2dFunctorBase {
   Conv2dFunctorBase(const int *strides,
@@ -331,7 +331,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
     auto output_data = output->mutable_data<T>();
 
     constexpr int inc_tile_size = 4;
-// TODO Auto tuning these parameters
+// TODO(heliangliang): Auto-tune these parameters
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
     const int c_tile_size = 4;
     const int h_tile_size = 2;
diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc
index b1a83782a8ab5dcd96c834fdcff937ecea55d844..9bbbdcf1d96852744de1e073e67e9b4a15dc2c1f 100644
--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -4,6 +4,8 @@
 
 #include "mace/kernels/conv_pool_2d_util.h"
 
+#include <vector>
+
 namespace mace {
 namespace kernels {
 
@@ -56,7 +58,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,   // NCHW
   }
 
   // Note: TensorFlow may padded one more on the right/bottom side
-  // TODO may be it's better to also truncate the left/top to
+  // TODO(liuqi): maybe it's better to also truncate the left/top to
   // utilize the more centered features. We need to benchmark
   // based on the model accuracy.
 
@@ -120,7 +122,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,   // NHWC
   }
 
   // Note: TensorFlow may padded one more on the right/bottom side
-  // TODO may be it's better to also truncate the left/top to
+  // TODO(liuqi): maybe it's better to also truncate the left/top to
   // utilize the more centered features. We need to benchmark
   // based on the model accuracy.
 
@@ -219,7 +221,7 @@ void CalPaddingSize(const index_t *input_shape,   // NCHW
   }
 
   // Note: TensorFlow may padded one more on the right/bottom side
-  // TODO may be it's better to also truncate the left/top to
+  // TODO(liuqi): maybe it's better to also truncate the left/top to
   // utilize the more centered features. We need to benchmark
   // based on the model accuracy.
   padding_size[0] = std::max<int>(
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index dc6b737077ab16b093d3993c5f414430fa17d186..166ea18a644ead1d53af2a7c3b83c73c617554d6 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -8,6 +8,8 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <algorithm>
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -17,8 +19,6 @@
 namespace mace {
 namespace kernels {
 
-namespace {
-
 template <typename T>
 void DepthwiseConv2dKernel(const T *input_ptr,
                            const T *filter_ptr,
@@ -233,8 +233,6 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
   }
 }
 
-}  // namespace
-
 struct DepthwiseConv2dFunctorBase {
   DepthwiseConv2dFunctorBase(const int *strides,
                              const Padding padding_type,
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 1aa883d568ff493fa092e23637b2b6accf1d8a38..0f9e9b40061890a62e36104746bcaf0120bfab0f 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -4,6 +4,9 @@
 #ifndef MACE_KERNELS_ELTWISE_H_
 #define MACE_KERNELS_ELTWISE_H_
 
+#include <algorithm>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h
index 5c527d4593e02bead0e55998674690a4c5864e50..4ab385291da1854808f73cd0bdd926c7cc17c616 100644
--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_FULLY_CONNECTED_H_
 #define MACE_KERNELS_FULLY_CONNECTED_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 88452bfe83dde8c0d05e1ff61a55410863c1b31a..62590400bf038773c9f16fae68f4c42de4ee9130 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -21,7 +21,6 @@
 namespace mace {
 namespace kernels {
 
-namespace {
 template<typename T,
   int register_tile_size,
   int h_count,
@@ -87,7 +86,6 @@ inline void MatMulKernelFunc(const T *A,
     }
   }
 }
-}  // namespace
 
 #define MACE_DO_MATMUL(HC, WC, KC) \
 MatMulKernelFunc<T, register_tile_size, HC, WC, KC>(a_ptr_batch_base, \
@@ -118,7 +116,6 @@ switch (k_count) { \
     LOG(FATAL) << "Unsupported k tile: " << k_count; \
 }
 
-
 #define MACE_CASE_W_MATMUL(HC) \
 switch (w_count) { \
   case 1: \
diff --git a/mace/kernels/neon/batch_norm_neon.cc b/mace/kernels/neon/batch_norm_neon.cc
index 19094ef7a15496d3ff65dffcd57d5927b20338e1..930a0c5d5692a120c71bfd962c569443aa90d805 100644
--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -78,7 +78,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
       }
     }
   }
-};
+}
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/neon/conv_2d_neon_1x1.cc b/mace/kernels/neon/conv_2d_neon_1x1.cc
index c098587c94610d2f38f44cbbebd7fc01da91bfc7..14c20cc387c03b41f2b190e038a693f9236514ab 100644
--- a/mace/kernels/neon/conv_2d_neon_1x1.cc
+++ b/mace/kernels/neon/conv_2d_neon_1x1.cc
@@ -296,7 +296,7 @@ void Conv2dNeonK1x1S1(const float *input,  // NCHW
       }
     }
   }
-};
+}
 
 void Conv2dNeonPixelK1x1S1(
     const float *input,  // NCHW
@@ -321,7 +321,7 @@ void Conv2dNeonPixelK1x1S1(
 
   const index_t total_pixels = height * width;
   // Process 4 * 2 = 8 pixels for each innermost loop
-  // TODO Does 64 bit v.s. 32 bit index matters? need benchmark
+  // TODO(heliangliang): Does 64-bit vs. 32-bit index matter? Needs benchmark
   const index_t total_loops = total_pixels >> 3;
   const index_t loop_remaining = total_pixels & 7;
 
@@ -329,7 +329,7 @@ void Conv2dNeonPixelK1x1S1(
   for (index_t n = 0; n < batch; ++n) {
     for (index_t c = 0; c < channels; ++c) {
       const float *filter_ptr = filter + c * input_channels;
-      // TODO Will GCC opt these out?
+      // TODO(heliangliang): Will GCC optimize these out?
       float *channel_output_start =
           output + n * channels * height * width + c * height * width;
       const float *input_ptr =
@@ -469,7 +469,7 @@ void Conv2dNeonPixelK1x1S1(
       }
     }
   }
-};
+}
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 9f9571d0637026330e1d2f5ad2dea31f116eeefc..e7869bb2fba3959c0fc810cbeb81f44f8f6ab00b 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -45,7 +45,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
-
   }
 
   std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -56,7 +55,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
 
   if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
     std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                    &output_image_shape);
     output_tensor->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
@@ -75,7 +75,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
      << "_" << output_shape[2] << "_" << output_shape[3];
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
-};
+}
 
 template struct AddNFunctor<DeviceType::OPENCL, float>;
 
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index d2490000b71a034a8cbe19f9ada9e5f5e1ed08fa..3d4c4ec5c7a64406ead61439a52d155689236240 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -32,7 +32,6 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
-
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index b0fa30a5cf146fd0da2ccd0ea9bc9ea419349f32..126fda7773f3613161d186a445f94b467ddf120c 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -14,7 +14,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
   std::vector<size_t> image_shape;
   if (!i2b_) {
-    CalImage2DShape(buffer->shape(), type, image_shape);
+    CalImage2DShape(buffer->shape(), type, &image_shape);
     if (type == WINOGRAD_FILTER) {
       std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
       image->ResizeImage(new_shape, image_shape);
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index a88b3b059cfacd31249f09dffd72f8ddee230c00..78d855e2088c292cc15468c00a6730870a69f740 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -39,7 +39,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
+    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
+                                   built_options);
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
@@ -61,7 +62,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
      << output->dim(2) << "_"
      << output->dim(3);
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
-
 }
 
 template
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index e99ab0605d02714b6851cb3cb8cf96f865ae5e1c..da8671db72ec89ebdc93ae43f64049ea0bcd41ee 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -41,7 +41,6 @@ static void Concat2(cl::Kernel *kernel,
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
-
   }
   if (!IsVecEqual(*prev_input_shape, input0->shape())) {
     uint32_t idx = 0;
@@ -140,7 +139,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
       inputs_count == 2 || divisible_four,
       "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
   std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   output->ResizeImage(output_shape, image_shape);
 
   switch (inputs_count) {
@@ -155,7 +154,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
         MACE_NOT_IMPLEMENTED;
       }
   }
-};
+}
 
 template struct ConcatFunctor<DeviceType::OPENCL, float>;
 template struct ConcatFunctor<DeviceType::OPENCL, half>;
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 46683fd1709eda83be6826d4e2519d28bf4956b1..468d80f09c60bd9584225d2c263766cef6c790e5 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -92,7 +92,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
 
   std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);
 
   if (kernel_h == kernel_w && kernel_h <= 5 &&
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 4109a97932163919e436a1847549c44ef8d60e31..62f8b09acc3458784cb3506f31dbbbdad51ef7ae 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -68,7 +68,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
 
     auto runtime = OpenCLRuntime::Global();
     *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
-
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 37b587dcef2caddae7ae5d73254a8c87dbf9f5a1..ecb109d1fbc456f8e9cefebcc6d29c35604770c1 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -91,18 +91,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     }
     kernel->setArg(idx++, *(output->opencl_image()));
     kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, static_cast<short>(input_height));
-    kernel->setArg(idx++, static_cast<short>(input_width));
-    kernel->setArg(idx++, static_cast<short>(input_channel_blocks));
-    kernel->setArg(idx++, static_cast<short>(height));
-    kernel->setArg(idx++, static_cast<short>(width));
-    kernel->setArg(idx++, static_cast<short>(filter_height));
-    kernel->setArg(idx++, static_cast<short>(filter_width));
-    kernel->setArg(idx++, static_cast<short>(paddings[0] / 2));
-    kernel->setArg(idx++, static_cast<short>(paddings[1] / 2));
+    kernel->setArg(idx++, static_cast<int16_t>(input_height));
+    kernel->setArg(idx++, static_cast<int16_t>(input_width));
+    kernel->setArg(idx++, static_cast<int16_t>(input_channel_blocks));
+    kernel->setArg(idx++, static_cast<int16_t>(height));
+    kernel->setArg(idx++, static_cast<int16_t>(width));
+    kernel->setArg(idx++, static_cast<int16_t>(filter_height));
+    kernel->setArg(idx++, static_cast<int16_t>(filter_width));
+    kernel->setArg(idx++, static_cast<int16_t>(paddings[0] / 2));
+    kernel->setArg(idx++, static_cast<int16_t>(paddings[1] / 2));
     if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) {
-      kernel->setArg(idx++, static_cast<short>(dilations[0]));
-      kernel->setArg(idx++, static_cast<short>(dilations[1]));
+      kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
+      kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
     }
     *prev_input_shape = input->shape();
   }
@@ -159,7 +159,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
   }
 
   std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);
 
   DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index dde05b29e2b2a6c8264ced78dea7d8fb3a37ef65..548d907de08ba8d25c884a5098f4da8b82db70ee 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -35,7 +35,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
-
   }
   if (!IsVecEqual(input_shape_, input0->shape())) {
     uint32_t idx = 0;
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index d5db519025514be82b5101de3c25c74c444c9b59..772a6d8d0c17774de35dca46e96fd9a15c94c38c 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -16,12 +16,14 @@ void FCWXKernel(cl::Kernel *kernel,
                 std::vector<index_t> *prev_input_shape,
                 Tensor *output,
                 const ActivationType activation,
-                std::vector<uint32_t> &gws,
-                std::vector<uint32_t> &lws,
+                std::vector<uint32_t> *gws,
+                std::vector<uint32_t> *lws,
                 const float relux_max_limit,
                 StatsFuture *future) {
   MACE_CHECK(input->dim(3) % 4 == 0)
     << "FC width kernel only support input with 4x channel.";
+  MACE_CHECK_NOTNULL(gws);
+  MACE_CHECK_NOTNULL(lws);
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
@@ -62,12 +64,11 @@ void FCWXKernel(cl::Kernel *kernel,
     const index_t output_blocks = RoundUpDiv4(output_size);
     const uint32_t wave_size = runtime->GetKernelWaveSize(*kernel);
 
-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
+    *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
 
     const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(*kernel);
-    const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
-    lws = {gws[0], gws[1], inter_local_blks};
-
+    const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
+    *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     const index_t batch = output->dim(0);
@@ -80,21 +81,22 @@ void FCWXKernel(cl::Kernel *kernel,
       kernel->setArg(idx++, *(bias->opencl_image()));
     }
     kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, (lws[0] * lws[1] * lws[2] * sizeof(float)), nullptr);
+    kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)),
+                   nullptr);
     kernel->setArg(idx++, static_cast<int>(input->dim(1)));
     kernel->setArg(idx++, static_cast<int>(input->dim(2)));
     kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
     kernel->setArg(idx++, static_cast<int>(output_blocks));
     kernel->setArg(idx++, relux_max_limit);
 
-    gws[2] = static_cast<uint32_t>(batch * output_blocks);
+    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
 
     *prev_input_shape = input->shape();
   }
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+      *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
+      cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
   MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
 
   if (future != nullptr) {
@@ -105,7 +107,6 @@ void FCWXKernel(cl::Kernel *kernel,
       }
     };
   }
-
 }
 
 template <typename T>
@@ -116,10 +117,12 @@ void FCWTXKernel(cl::Kernel *kernel,
                  std::vector<index_t> *prev_input_shape,
                  Tensor *output,
                  const ActivationType activation,
-                 std::vector<uint32_t> &gws,
-                 std::vector<uint32_t> &lws,
+                 std::vector<uint32_t> *gws,
+                 std::vector<uint32_t> *lws,
                  const float relux_max_limit,
                  StatsFuture *future) {
+  MACE_CHECK_NOTNULL(gws);
+  MACE_CHECK_NOTNULL(lws);
   if (kernel->get() == nullptr) {
     auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
@@ -152,7 +155,7 @@ void FCWTXKernel(cl::Kernel *kernel,
     *kernel =
         runtime->BuildKernel("fully_connected", kernel_name, built_options);
 
-    lws = {16, 64, 1};
+    *lws = {16, 64, 1};
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
@@ -171,18 +174,16 @@ void FCWTXKernel(cl::Kernel *kernel,
     const index_t batch = output->dim(0);
     const index_t output_blocks = RoundUpDiv4(output->dim(3));
 
-    gws = {
+    *gws = {
         static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
     };
-
     *prev_input_shape = input->shape();
   }
 
   std::stringstream ss;
   ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
      << output->dim(2) << "_" << output->dim(3);
-  TuningOrRun2DKernel(*kernel, ss.str(), gws.data(), lws, future);
-
+  TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
 }
 
 template <typename T>
@@ -194,17 +195,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
     StatsFuture *future) {
   std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
   std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);
 
   if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
     FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                   activation_, gws_, lws_, relux_max_limit_, future);
+                   activation_, &gws_, &lws_, relux_max_limit_, future);
   } else {
     FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                  activation_, gws_, lws_, relux_max_limit_, future);
+                  activation_, &gws_, &lws_, relux_max_limit_, future);
   }
-};
+}
 
 template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;
 
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 3f41966299f2b8ec4b61e65d1191eaef1d94b533..e3cadbc6f5d1cd73b7f5b6a2de02c370a19ce0c1 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -3,6 +3,11 @@
 //
 
 #include "mace/kernels/opencl/helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
 #include "mace/utils/tuner.h"
 #include "mace/utils/utils.h"
 
@@ -11,91 +16,92 @@ namespace kernels {
 
 // [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
-                           std::vector<size_t> &image_shape) {
+                           std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
-  image_shape[1] = shape[0] * shape[1];
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
+  (*image_shape)[1] = shape[0] * shape[1];
 }
 
 // [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
 void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
-                               std::vector<size_t> &image_shape) {
+                               std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
-  image_shape[1] = RoundUpDiv4(shape[2]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
+  (*image_shape)[1] = RoundUpDiv4(shape[2]);
 }
 
 // [H * W * M, (Ic + 3) / 4]
 void CalDepthwiseConv2dFilterImageShape(
     const std::vector<index_t> &shape, /* HWIM */
-    std::vector<size_t> &image_shape) {
+    std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = shape[0] * shape[1] * shape[3];
-  image_shape[1] = RoundUpDiv4(shape[2]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[0] * shape[1] * shape[3];
+  (*image_shape)[1] = RoundUpDiv4(shape[2]);
 }
 
 // [(size + 3) / 4, 1]
 void CalArgImageShape(const std::vector<index_t> &shape,
-                      std::vector<size_t> &image_shape) {
+                      std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 1);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[0]);
-  image_shape[1] = 1;
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[0]);
+  (*image_shape)[1] = 1;
 }
 
 // Only support 3x3 now
 // [ (Ic + 3) / 4, 16 * Oc]
 void CalWinogradFilterImageShape(
     const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
-    std::vector<size_t> &image_shape) {
+    std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[1]);
-  image_shape[1] = (shape[0] << 4);
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[1]);
+  (*image_shape)[1] = (shape[0] << 4);
 }
 
 // [W * C, N * RoundUp<4>(H)]
 void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
-                              std::vector<size_t> &image_shape) {
+                              std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = shape[2] * shape[3];
-  image_shape[1] = shape[0] * RoundUpDiv4(shape[1]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[2] * shape[3];
+  (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
 }
 
 // [RoundUp<4>(W) * C, N * H]
 void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
-                             std::vector<size_t> &image_shape) {
+                             std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[2]) * shape[3];
-  image_shape[1] = shape[0] * shape[1];
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
+  (*image_shape)[1] = shape[0] * shape[1];
 }
 
 // [W, (H + 3) / 4]
 void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */
-                               std::vector<size_t> &image_shape) {
+                               std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 2);
-  image_shape.resize(2);
-  image_shape[0] = shape[1];
-  image_shape[1] = RoundUpDiv4(shape[0]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[1];
+  (*image_shape)[1] = RoundUpDiv4(shape[0]);
 }
 
 // [(W + 3) / 4, H]
 void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
-                              std::vector<size_t> &image_shape) {
+                              std::vector<size_t> *image_shape) {
   MACE_CHECK(shape.size() == 2);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[1]);
-  image_shape[1] = shape[0];
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[1]);
+  (*image_shape)[1] = shape[0];
 }
 
 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
-                     std::vector<size_t> &image_shape) {
+                     std::vector<size_t> *image_shape) {
+  MACE_CHECK_NOTNULL(image_shape);
   switch (type) {
     case CONV2D_FILTER:
       CalConv2dFilterImageShape(shape, image_shape);
@@ -188,7 +194,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }
 
-void TuningOrRun3DKernel(cl::Kernel &kernel,
+void TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
@@ -202,7 +208,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
     local_ws[2] =
         std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        // TODO tuning these magic numbers
+        // TODO(heliangliang): tune these magic numbers
         {local_ws[0], local_ws[1], local_ws[2], 1},
         {kwg_size / 16, 4, 4, 1},
         {kwg_size / 32, 4, 8, 1},
@@ -291,7 +297,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
   }
 }
 
-void TuningOrRun2DKernel(cl::Kernel &kernel,
+void TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 56bf295ee2dec5451f9d142ccd0e63441b37e545..89712c9b96aa043f5019cde6eae23aa07109f6f7 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -5,6 +5,9 @@
 #ifndef MACE_KERNELS_OPENCL_HELPER_H_
 #define MACE_KERNELS_OPENCL_HELPER_H_
 
+#include <string>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
@@ -30,7 +33,7 @@ enum BufferType {
 
 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
-                     std::vector<size_t> &image_shape);
+                     std::vector<size_t> *image_shape);
 
 std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                       const BufferType type);
@@ -43,13 +46,13 @@ std::string DtToCLDt(const DataType dt);
 
 std::string DtToUpstreamCLDt(const DataType dt);
 
-void TuningOrRun3DKernel(cl::Kernel &kernel,
+void TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future);
 
-void TuningOrRun2DKernel(cl::Kernel &kernel,
+void TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::string tuning_key,
                          const uint32_t *gws,
                          const std::vector<uint32_t> &lws,
@@ -78,7 +81,6 @@ bool IsVecEqual(const std::vector<T> &input0,
       (std::equal(input0.begin(), input0.end(), input1.begin())));
 }
 
-namespace {
 template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
   (*ss) << v;
@@ -92,7 +94,6 @@ void AppendToStream(std::stringstream *ss,
   (*ss) << first << delimiter;
   AppendToStream(ss, delimiter, args...);
 }
-}  // namespace
 
 template <typename... Args>
 std::string Concat(Args... args) {
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 4b61edb271df814b4bdcea251d28b2ca03cf3be4..c5bd2b0ba3f789f28992a49e10ffa7b4a357a8c5 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -17,7 +17,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
                                                       StatsFuture *future) {
   std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
   std::vector<size_t> c_image_shape;
-  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
+  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
   C->ResizeImage(c_shape, c_image_shape);
 
   const index_t batch = C->dim(0);
@@ -56,7 +56,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
      << C->dim(2) << "_" << C->dim(3);
   TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
-};
+}
 
 template struct MatMulFunctor<DeviceType::OPENCL, float>;
 
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index d8a6d675a8da5749d3a2cf02360e3ec619a809ff..5b52a0934facd4b4f14affb9bafb819d258fa444 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -36,12 +36,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       built_options.emplace("-DPOOL_AVG");
     }
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
-
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     std::vector<index_t> output_shape(4);
-    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
-                                         input->dim(3)};
+    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
+                                         input->dim(3), input->dim(3)};
 
     std::vector<int> paddings(2);
     if (paddings_.empty()) {
@@ -50,12 +49,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
           padding_type_, output_shape.data(), paddings.data());
     } else {
       paddings = paddings_;
-      CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
-                     dilations_, strides_, RoundType::CEIL, output_shape.data());
+      CalcOutputSize(input->shape().data(), filter_shape.data(),
+                     paddings_.data(), dilations_, strides_, RoundType::CEIL,
+                     output_shape.data());
     }
 
     std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                    &output_image_shape);
     output->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index a3bb2ee1951f433ae41a4c2dc41367fe77d1e497..373709168f190a6122d29bbaee457a2b356b4833 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -34,7 +34,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ =
         runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
-
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     MACE_CHECK(out_height > 0 && out_width > 0);
@@ -42,7 +41,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
 
     std::vector<size_t> output_image_shape;
     CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    output_image_shape);
+                    &output_image_shape);
     output->ResizeImage(output_shape, output_image_shape);
 
     float height_scale =
@@ -60,7 +59,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(out_height));
 
     input_shape_ = input->shape();
-
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 63efc555dbf8a743e3fc6881a06e0202480bbd16..6bc9ae3bf57d8c4f3df9ea41cad9bf5f283ce01a 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -24,7 +24,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
                                      input->dim(2), output_channels});
 
   std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
   for (size_t i= 0; i < outputs_count; ++i) {
     output_list[i]->ResizeImage(output_shape, image_shape);
   }
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 4aabe9017f06073ddffe7e04871b62b76da15dc6..077db9ddc1ecf2d72f71511349945ea53fe0eb73 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -33,7 +33,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
-
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
     uint32_t idx = 0;
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 91f5564d520de9e11ad832231060f37ea3f64191..fe911fbddb49687c74edf1e29f0276c86a249ccc 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -22,7 +22,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     StatsFuture *future) {
   const char *kernel_name = nullptr;
   std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
   if (b2s_) {
     space_tensor->ResizeImage(output_shape, output_image_shape);
     kernel_name = "batch_to_space";
@@ -42,7 +43,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
     kernel_ =
         runtime->BuildKernel("space_to_batch", kernel_name, built_options);
-
   }
   if (!IsVecEqual(space_shape_, space_tensor->shape())) {
     uint32_t idx = 0;
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index c07ccc9944786e8cbcd8dde4aa6ada7794542019..3b86640866a307ba97d7b0f064a1df099c021be4 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -27,7 +27,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     auto runtime = OpenCLRuntime::Global();
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
-
   }
   std::vector<index_t> output_shape(4);
   std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
@@ -49,7 +48,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
   if (!IsVecEqual(input_shape_, input_tensor->shape())) {
     output_shape = {16, input_tensor->dim(3), out_width, 1};
     std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
     output_tensor->ResizeImage(output_shape, image_shape);
 
     uint32_t idx = 0;
@@ -83,7 +82,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *bias,
     Tensor *output_tensor,
     StatsFuture *future) {
-
   if (kernel_.get() == nullptr) {
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
@@ -125,7 +123,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     std::vector<index_t> output_shape = {batch_, height_, width_,
                                          input_tensor->dim(1)};
     std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
     output_tensor->ResizeImage(output_shape, image_shape);
 
     const uint32_t round_h = (height_ + 1) / 2;
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index bc9892e5864d420f9505de9462df5a17eedb4241..15cc691e71927300bec48224a7666f1468eb74c1 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -2,10 +2,13 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#ifndef MACE_KERNELS_POOLING_H
-#define MACE_KERNELS_POOLING_H
+#ifndef MACE_KERNELS_POOLING_H_
+#define MACE_KERNELS_POOLING_H_
 
+#include <algorithm>
 #include <limits>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -188,4 +191,4 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_POOLING_H
+#endif  // MACE_KERNELS_POOLING_H_
diff --git a/mace/kernels/reshape.h b/mace/kernels/reshape.h
index 544ba360a4e1c751dc802381fb99ae977f749a26..14e560789db709464400136116ba02d373207c65 100644
--- a/mace/kernels/reshape.h
+++ b/mace/kernels/reshape.h
@@ -4,6 +4,8 @@
 #ifndef MACE_KERNELS_RESHAPE_H_
 #define MACE_KERNELS_RESHAPE_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -20,7 +22,7 @@ struct ReshapeFunctor {
                   Tensor *output,
                   StatsFuture *future) {
     output->Resize(out_shape);
-    // TODO copy on write to avoid this copy.
+    // TODO(liuqi): copy on write to avoid this copy.
     output->CopyBytes(input->raw_data(), input->size() * sizeof(T));
   }
 };
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 52c1da102926870d6e65dfa52ee68c7ff5a43f76..65e5121211d4d836d6d17809a843e0778defaecb 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -4,6 +4,9 @@
 #ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
 #define MACE_KERNELS_RESIZE_BILINEAR_H_
 
+#include <algorithm>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -11,7 +14,6 @@
 namespace mace {
 namespace kernels {
 
-namespace {
 struct CachedInterpolation {
   index_t lower;  // Lower source index used in the interpolation
   index_t upper;  // Upper source index used in the interpolation
@@ -101,7 +103,6 @@ void ResizeImage(const T *images,
     }
   }
 }
-}
 
 struct ResizeBilinearFunctorBase {
   ResizeBilinearFunctorBase(const std::vector<index_t> &size,
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index b08ea7ef4fcd1e235375952085e9965c7f897334..59d9d667b0a63da1e1d3ee471aecec9efd9be1e9 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_SLICE_H_
 #define MACE_KERNELS_SLICE_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -16,7 +18,6 @@ namespace kernels {
 
 template<DeviceType D, typename T>
 struct SliceFunctor {
-
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future) {
@@ -56,15 +57,13 @@ struct SliceFunctor {
 
 template<typename T>
 struct SliceFunctor<DeviceType::OPENCL, T> {
-
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future);
   cl::Kernel kernel_;
-
 };
 
-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace
 
 #endif  // MACE_KERNELS_SLICE_H_
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index d5bc5717d8cfdbfc391de634f08d8fd427e5ca9d..a1c4ea2f6e5b9200f17d54906316a83cbefaa49a 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_SOFTMAX_H_
 #define MACE_KERNELS_SOFTMAX_H_
 
+#include <algorithm>
+#include <functional>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -38,7 +42,7 @@ struct SoftmaxFunctor {
         for (index_t c = 1; c < num_classes; ++c) {
           max_value = std::max(max_value, logits_ptr[pos + c]);
         }
-        // TODO: check overflow?
+        // TODO(liuqi): check overflow?
         T sum = 0;
         for (index_t c = 0; c < num_classes; ++c) {
           exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
@@ -60,7 +64,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
   std::vector<index_t> input_shape_;
 };
 
-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace
 
 #endif  // MACE_KERNELS_SOFTMAX_H_
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index ef7467b57acd0fc1d3563148ec53dd1ea4869a9f..757f784820f90fee842fc385606db4755cb52293 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -2,8 +2,10 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
 
-#ifndef MACE_KERNELS_CONV_2D_H_
-#define MACE_KERNELS_CONV_2D_H_
+#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
+#define MACE_KERNELS_SPACE_TO_BATCH_H_
+
+#include <vector>
 
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -60,4 +62,4 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
 }  // namespace kernels
 }  // namespace mace
 
-#endif  // MACE_KERNELS_CONV_2D_H_
+#endif  // MACE_KERNELS_SPACE_TO_BATCH_H_
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index f3b7f7d640328860f5ffdc5dc6b065e78e324896..6f483dacb06f920c54b14930dba3fd05ff845e44 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
 #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
 
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"