fix cpplint for mace/kernels

16703420 · wuchenghui · 564833a5 · 16703420 · 16703420 · 16703420
41 changed file
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_ACTIVATION_H_
 #define MACE_KERNELS_ACTIVATION_H_

+#include <algorithm>
+#include <string>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -8,6 +8,7 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <algorithm>
 #include <vector>

 #include "mace/core/future.h"
@@ -17,9 +18,7 @@
 namespace mace {
 namespace kernels {

-namespace {
 constexpr int kCostPerGroup = 1024;
-}  // namespace

 template <DeviceType D, typename T>
 struct AddNFunctor {

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -8,6 +8,7 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <vector>

 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -159,7 +160,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
  std::vector<index_t> input_shape_;
 };

-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace

 #endif  // MACE_KERNELS_BATCH_NORM_H_
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_BIAS_ADD_H_
 #define MACE_KERNELS_BIAS_ADD_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -65,7 +67,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
  std::vector<index_t> input_shape_;
 };

-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace

 #endif  // MACE_KERNELS_BIAS_ADD_H_
--- a/mace/kernels/buffer_to_image.h
+++ b/mace/kernels/buffer_to_image.h
@@ -13,13 +13,14 @@ namespace mace {
 namespace kernels {

 struct BufferToImageFunctorBase {
-  BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
+  explicit BufferToImageFunctorBase(bool i2b) : i2b_(i2b) {}
  bool i2b_;
 };

 template <DeviceType D, typename T>
 struct BufferToImageFunctor : BufferToImageFunctorBase {
-  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
+  explicit BufferToImageFunctor(bool i2b = false)
+      : BufferToImageFunctorBase(i2b) {}
  void operator()(Tensor *input,
                  const BufferType type,
                  Tensor *output,
@@ -30,14 +31,15 @@ struct BufferToImageFunctor : BufferToImageFunctorBase {

 template <typename T>
 struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase {
-  BufferToImageFunctor(bool i2b = false) : BufferToImageFunctorBase(i2b) {}
+  explicit BufferToImageFunctor(bool i2b = false)
+      : BufferToImageFunctorBase(i2b) {}
  void operator()(Tensor *input,
                  const BufferType type,
                  Tensor *output,
                  StatsFuture *future);
 };

-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace

 #endif  // MACE_KERNELS_BUFFER_TO_IMAGE_H_
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
 #define MACE_KERNELS_CHANNEL_SHUFFLE_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"

@@ -13,7 +15,7 @@ namespace kernels {

 template <DeviceType D, typename T>
 struct ChannelShuffleFunctor {
-  ChannelShuffleFunctor(const int groups) : groups_(groups) {}
+  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}

  void operator()(const Tensor *input,
                  Tensor *output,
@@ -49,7 +51,7 @@ struct ChannelShuffleFunctor {

 template <typename T>
 struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
-  ChannelShuffleFunctor(const int groups) : groups_(groups) {}
+  explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}

  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);


--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_CONCAT_H_
 #define MACE_KERNELS_CONCAT_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -15,14 +17,14 @@ namespace mace {
 namespace kernels {

 struct ConcatFunctorBase {
-  ConcatFunctorBase(const int32_t axis) : axis_(axis) {}
+  explicit ConcatFunctorBase(const int32_t axis) : axis_(axis) {}

  int32_t axis_;
 };

 template <DeviceType D, typename T>
 struct ConcatFunctor : ConcatFunctorBase {
-  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
+  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}

  void operator()(const std::vector<const Tensor *> &input_list,
                  Tensor *output,
@@ -77,7 +79,7 @@ struct ConcatFunctor : ConcatFunctorBase {

 template <typename T>
 struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
-  ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}
+  explicit ConcatFunctor(const int32_t axis) : ConcatFunctorBase(axis) {}

  void operator()(const std::vector<const Tensor *> &input_list,
                  Tensor *output,
@@ -86,7 +88,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
  std::vector<index_t> input_shape_;
 };

-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace

 #endif  // MACE_KERNELS_CONCAT_H_
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -8,6 +8,8 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <algorithm>
+#include <vector>

 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -18,7 +20,6 @@

 namespace mace {
 namespace kernels {
-namespace {

 template <typename T,
          int inc_tile_size,
@@ -61,9 +62,9 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
        // AArch64 NEON has 32 128-bit general purpose registers
        static_assert(inc_tile_size == 4, "input channels tile size must be 4");
-        float32x4_t in[h_count * w_count];
+        float32x4_t in[h_count * w_count];  // NOLINT(runtime/arrays)
 #else
-        T in[h_count * w_count * inc_tile_size];
+        T in[h_count * w_count * inc_tile_size];  // NOLINT(runtime/arrays)
 #endif
        for (int hi = 0; hi < h_count; ++hi) {
          for (int wi = 0; wi < w_count; ++wi) {
@@ -86,9 +87,9 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start

 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
        static_assert(inc_tile_size == 4, "input channels tile size must be 4");
-        float32x4_t weights[c_count];
+        float32x4_t weights[c_count];  // NOLINT(runtime/arrays)
 #else
-        T weights[c_count * inc_tile_size];
+        T weights[c_count * inc_tile_size];  // NOLINT(runtime/arrays)
 #endif
        for (int ci = 0; ci < c_count; ++ci) {
          const int weights_idx = ci;
@@ -126,7 +127,7 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
      }
      // handling the remaining input channels
      for (; inc < input_channels; ++inc) {
-        T in[h_count * w_count];
+        T in[h_count * w_count];  // NOLINT(runtime/arrays)
        for (int hi = 0; hi < h_count; ++hi) {
          for (int wi = 0; wi < w_count; ++wi) {
            const int in_idx = hi * w_count + wi;
@@ -138,7 +139,7 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
          }
        }

-        T weights[c_count];
+        T weights[c_count];  // NOLINT(runtime/arrays)
        for (int ci = 0; ci < c_count; ++ci) {
          const int weights_idx = ci;
          const int filter_offset =
@@ -173,7 +174,6 @@ void Conv2dKernelFunc(const T *input_ptr,  // batch start
    }
  }
 }
-};  // namespace

 struct Conv2dFunctorBase {
  Conv2dFunctorBase(const int *strides,
@@ -331,7 +331,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
    auto output_data = output->mutable_data<T>();

    constexpr int inc_tile_size = 4;
-// TODO Auto tuning these parameters
+// TODO(heliangliang) Auto tuning these parameters
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
    const int c_tile_size = 4;
    const int h_tile_size = 2;

--- a/mace/kernels/conv_pool_2d_util.cc
+++ b/mace/kernels/conv_pool_2d_util.cc
@@ -4,6 +4,8 @@

 #include "mace/kernels/conv_pool_2d_util.h"

+#include <vector>
+
 namespace mace {
 namespace kernels {

@@ -56,7 +58,7 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,   // NCHW
  }

  // Note: TensorFlow may padded one more on the right/bottom side
-  // TODO may be it's better to also truncate the left/top to
+  // TODO(liuqi): may be it's better to also truncate the left/top to
  // utilize the more centered features. We need to benchmark
  // based on the model accuracy.

@@ -120,7 +122,7 @@ void CalcNHWCPaddingAndOutputSize(const index_t *input_shape,   // NHWC
  }

  // Note: TensorFlow may padded one more on the right/bottom side
-  // TODO may be it's better to also truncate the left/top to
+  // TODO(liuqi): may be it's better to also truncate the left/top to
  // utilize the more centered features. We need to benchmark
  // based on the model accuracy.

@@ -219,7 +221,7 @@ void CalPaddingSize(const index_t *input_shape,   // NCHW
  }

  // Note: TensorFlow may padded one more on the right/bottom side
-  // TODO may be it's better to also truncate the left/top to
+  // TODO(liuqi): may be it's better to also truncate the left/top to
  // utilize the more centered features. We need to benchmark
  // based on the model accuracy.
  padding_size[0] = std::max<int>(

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -8,6 +8,8 @@
 #if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
 #include <arm_neon.h>
 #endif
+#include <algorithm>
+#include <vector>

 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -17,8 +19,6 @@
 namespace mace {
 namespace kernels {

-namespace {
-
 template <typename T>
 void DepthwiseConv2dKernel(const T *input_ptr,
                           const T *filter_ptr,
@@ -233,8 +233,6 @@ void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
  }
 }

-}  // namespace
-
 struct DepthwiseConv2dFunctorBase {
  DepthwiseConv2dFunctorBase(const int *strides,
                             const Padding padding_type,

--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -4,6 +4,9 @@
 #ifndef MACE_KERNELS_ELTWISE_H_
 #define MACE_KERNELS_ELTWISE_H_

+#include <algorithm>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_FULLY_CONNECTED_H_
 #define MACE_KERNELS_FULLY_CONNECTED_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -21,7 +21,6 @@
 namespace mace {
 namespace kernels {

-namespace {
 template<typename T,
  int register_tile_size,
  int h_count,
@@ -87,7 +86,6 @@ inline void MatMulKernelFunc(const T *A,
    }
  }
 }
-}  // namespace

 #define MACE_DO_MATMUL(HC, WC, KC) \
 MatMulKernelFunc<T, register_tile_size, HC, WC, KC>(a_ptr_batch_base, \
@@ -118,7 +116,6 @@ switch (k_count) { \
    LOG(FATAL) << "Unsupported k tile: " << k_count; \
 }

-
 #define MACE_CASE_W_MATMUL(HC) \
 switch (w_count) { \
  case 1: \

--- a/mace/kernels/neon/batch_norm_neon.cc
+++ b/mace/kernels/neon/batch_norm_neon.cc
@@ -78,7 +78,7 @@ void BatchNormFunctor<DeviceType::NEON, float>::operator()(
      }
    }
  }
-};
+}

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/neon/conv_2d_neon_1x1.cc
+++ b/mace/kernels/neon/conv_2d_neon_1x1.cc
@@ -296,7 +296,7 @@ void Conv2dNeonK1x1S1(const float *input,  // NCHW
      }
    }
  }
-};
+}

 void Conv2dNeonPixelK1x1S1(
    const float *input,  // NCHW
@@ -321,7 +321,7 @@ void Conv2dNeonPixelK1x1S1(

  const index_t total_pixels = height * width;
  // Process 4 * 2 = 8 pixels for each innermost loop
-  // TODO Does 64 bit v.s. 32 bit index matters? need benchmark
+  // TODO(heliangliang): Does 64 bit v.s. 32 bit index matters? need benchmark
  const index_t total_loops = total_pixels >> 3;
  const index_t loop_remaining = total_pixels & 7;

@@ -329,7 +329,7 @@ void Conv2dNeonPixelK1x1S1(
  for (index_t n = 0; n < batch; ++n) {
    for (index_t c = 0; c < channels; ++c) {
      const float *filter_ptr = filter + c * input_channels;
-      // TODO Will GCC opt these out?
+      // TODO(heliangliang): Will GCC opt these out?
      float *channel_output_start =
          output + n * channels * height * width + c * height * width;
      const float *input_ptr =
@@ -469,7 +469,7 @@ void Conv2dNeonPixelK1x1S1(
      }
    }
  }
-};
+}

 }  // namespace kernels
 }  // namespace mace
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -45,7 +45,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
-
  }

  std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -56,7 +55,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(

  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                    &output_image_shape);
    output_tensor->ResizeImage(output_shape, output_image_shape);

    uint32_t idx = 0;
@@ -75,7 +75,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
     << "_" << output_shape[2] << "_" << output_shape[3];
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
-};
+}

 template struct AddNFunctor<DeviceType::OPENCL, float>;


--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -32,7 +32,6 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
-
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -14,7 +14,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
    Tensor *buffer, const BufferType type, Tensor *image, StatsFuture *future) {
  std::vector<size_t> image_shape;
  if (!i2b_) {
-    CalImage2DShape(buffer->shape(), type, image_shape);
+    CalImage2DShape(buffer->shape(), type, &image_shape);
    if (type == WINOGRAD_FILTER) {
      std::vector<index_t> new_shape = CalWinogradShape(buffer->shape(), type);
      image->ResizeImage(new_shape, image_shape);

--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -39,7 +39,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, built_options);
+    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
+                                   built_options);
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
@@ -61,7 +62,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     << output->dim(2) << "_"
     << output->dim(3);
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
-
 }

 template

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -41,7 +41,6 @@ static void Concat2(cl::Kernel *kernel,
      built_options.emplace("-DDIVISIBLE_FOUR");
    }
    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
-
  }
  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
    uint32_t idx = 0;
@@ -140,7 +139,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
      inputs_count == 2 || divisible_four,
      "Dimensions of inputs should be divisible by 4 when inputs_count > 2.");
  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
  output->ResizeImage(output_shape, image_shape);

  switch (inputs_count) {
@@ -155,7 +154,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
        MACE_NOT_IMPLEMENTED;
      }
  }
-};
+}

 template struct ConcatFunctor<DeviceType::OPENCL, float>;
 template struct ConcatFunctor<DeviceType::OPENCL, half>;

--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -92,7 +92,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }

  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  if (kernel_h == kernel_w && kernel_h <= 5 &&

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -68,7 +68,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,

    auto runtime = OpenCLRuntime::Global();
    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
-
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -91,18 +91,18 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    }
    kernel->setArg(idx++, *(output->opencl_image()));
    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, static_cast<short>(input_height));
-    kernel->setArg(idx++, static_cast<short>(input_width));
-    kernel->setArg(idx++, static_cast<short>(input_channel_blocks));
-    kernel->setArg(idx++, static_cast<short>(height));
-    kernel->setArg(idx++, static_cast<short>(width));
-    kernel->setArg(idx++, static_cast<short>(filter_height));
-    kernel->setArg(idx++, static_cast<short>(filter_width));
-    kernel->setArg(idx++, static_cast<short>(paddings[0] / 2));
-    kernel->setArg(idx++, static_cast<short>(paddings[1] / 2));
+    kernel->setArg(idx++, static_cast<int16_t>(input_height));
+    kernel->setArg(idx++, static_cast<int16_t>(input_width));
+    kernel->setArg(idx++, static_cast<int16_t>(input_channel_blocks));
+    kernel->setArg(idx++, static_cast<int16_t>(height));
+    kernel->setArg(idx++, static_cast<int16_t>(width));
+    kernel->setArg(idx++, static_cast<int16_t>(filter_height));
+    kernel->setArg(idx++, static_cast<int16_t>(filter_width));
+    kernel->setArg(idx++, static_cast<int16_t>(paddings[0] / 2));
+    kernel->setArg(idx++, static_cast<int16_t>(paddings[1] / 2));
    if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) {
-      kernel->setArg(idx++, static_cast<short>(dilations[0]));
-      kernel->setArg(idx++, static_cast<short>(dilations[1]));
+      kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
+      kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
    }
    *prev_input_shape = input->shape();
  }
@@ -159,7 +159,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
  }

  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),

--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -35,7 +35,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
-
  }
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;

--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -16,12 +16,14 @@ void FCWXKernel(cl::Kernel *kernel,
                std::vector<index_t> *prev_input_shape,
                Tensor *output,
                const ActivationType activation,
-                std::vector<uint32_t> &gws,
-                std::vector<uint32_t> &lws,
+                std::vector<uint32_t> *gws,
+                std::vector<uint32_t> *lws,
                const float relux_max_limit,
                StatsFuture *future) {
  MACE_CHECK(input->dim(3) % 4 == 0)
    << "FC width kernel only support input with 4x channel.";
+  MACE_CHECK_NOTNULL(gws);
+  MACE_CHECK_NOTNULL(lws);
  auto runtime = OpenCLRuntime::Global();

  if (kernel->get() == nullptr) {
@@ -62,12 +64,11 @@ void FCWXKernel(cl::Kernel *kernel,
    const index_t output_blocks = RoundUpDiv4(output_size);
    const uint32_t wave_size = runtime->GetKernelWaveSize(*kernel);

-    gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
+    *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};

    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(*kernel);
-    const uint32_t inter_local_blks = kwg_size / (gws[0] * gws[1]);
-    lws = {gws[0], gws[1], inter_local_blks};
-
+    const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
+    *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    const index_t batch = output->dim(0);
@@ -80,21 +81,22 @@ void FCWXKernel(cl::Kernel *kernel,
      kernel->setArg(idx++, *(bias->opencl_image()));
    }
    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, (lws[0] * lws[1] * lws[2] * sizeof(float)), nullptr);
+    kernel->setArg(idx++, ((*lws)[0] * (*lws)[1] * (*lws)[2] * sizeof(float)),
+                   nullptr);
    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
    kernel->setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
    kernel->setArg(idx++, static_cast<int>(output_blocks));
    kernel->setArg(idx++, relux_max_limit);

-    gws[2] = static_cast<uint32_t>(batch * output_blocks);
+    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);

    *prev_input_shape = input->shape();
  }
  cl::Event event;
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+      *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
+      cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;

  if (future != nullptr) {
@@ -105,7 +107,6 @@ void FCWXKernel(cl::Kernel *kernel,
      }
    };
  }
-
 }

 template <typename T>
@@ -116,10 +117,12 @@ void FCWTXKernel(cl::Kernel *kernel,
                 std::vector<index_t> *prev_input_shape,
                 Tensor *output,
                 const ActivationType activation,
-                 std::vector<uint32_t> &gws,
-                 std::vector<uint32_t> &lws,
+                 std::vector<uint32_t> *gws,
+                 std::vector<uint32_t> *lws,
                 const float relux_max_limit,
                 StatsFuture *future) {
+  MACE_CHECK_NOTNULL(gws);
+  MACE_CHECK_NOTNULL(lws);
  if (kernel->get() == nullptr) {
    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
@@ -152,7 +155,7 @@ void FCWTXKernel(cl::Kernel *kernel,
    *kernel =
        runtime->BuildKernel("fully_connected", kernel_name, built_options);

-    lws = {16, 64, 1};
+    *lws = {16, 64, 1};
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
@@ -171,18 +174,16 @@ void FCWTXKernel(cl::Kernel *kernel,
    const index_t batch = output->dim(0);
    const index_t output_blocks = RoundUpDiv4(output->dim(3));

-    gws = {
+    *gws = {
        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
    };
-
    *prev_input_shape = input->shape();
  }

  std::stringstream ss;
  ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
     << output->dim(2) << "_" << output->dim(3);
-  TuningOrRun2DKernel(*kernel, ss.str(), gws.data(), lws, future);
-
+  TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
 }

 template <typename T>
@@ -194,17 +195,18 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
    StatsFuture *future) {
  std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  if (weight_type_ == BufferType::WEIGHT_HEIGHT) {
    FCWTXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                   activation_, gws_, lws_, relux_max_limit_, future);
+                   activation_, &gws_, &lws_, relux_max_limit_, future);
  } else {
    FCWXKernel<T>(&kernel_, input, weight, bias, &input_shape_, output,
-                  activation_, gws_, lws_, relux_max_limit_, future);
+                  activation_, &gws_, &lws_, relux_max_limit_, future);
  }
-};
+}

 template struct FullyConnectedFunctor<DeviceType::OPENCL, float>;


--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -3,6 +3,11 @@
 //

 #include "mace/kernels/opencl/helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
 #include "mace/utils/tuner.h"
 #include "mace/utils/utils.h"

@@ -11,91 +16,92 @@ namespace kernels {

 // [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
-                           std::vector<size_t> &image_shape) {
+                           std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
-  image_shape[1] = shape[0] * shape[1];
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[3]) * shape[2];
+  (*image_shape)[1] = shape[0] * shape[1];
 }

 // [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
 void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWOI */
-                               std::vector<size_t> &image_shape) {
+                               std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
-  image_shape[1] = RoundUpDiv4(shape[2]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[3], 4);
+  (*image_shape)[1] = RoundUpDiv4(shape[2]);
 }

 // [H * W * M, (Ic + 3) / 4]
 void CalDepthwiseConv2dFilterImageShape(
    const std::vector<index_t> &shape, /* HWIM */
-    std::vector<size_t> &image_shape) {
+    std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = shape[0] * shape[1] * shape[3];
-  image_shape[1] = RoundUpDiv4(shape[2]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[0] * shape[1] * shape[3];
+  (*image_shape)[1] = RoundUpDiv4(shape[2]);
 }

 // [(size + 3) / 4, 1]
 void CalArgImageShape(const std::vector<index_t> &shape,
-                      std::vector<size_t> &image_shape) {
+                      std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 1);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[0]);
-  image_shape[1] = 1;
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[0]);
+  (*image_shape)[1] = 1;
 }

 // Only support 3x3 now
 // [ (Ic + 3) / 4, 16 * Oc]
 void CalWinogradFilterImageShape(
    const std::vector<index_t> &shape, /* Oc, Ic, H, W*/
-    std::vector<size_t> &image_shape) {
+    std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[1]);
-  image_shape[1] = (shape[0] << 4);
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[1]);
+  (*image_shape)[1] = (shape[0] << 4);
 }

 // [W * C, N * RoundUp<4>(H)]
 void CalInOutHeightImageShape(const std::vector<index_t> &shape, /* NHWC */
-                              std::vector<size_t> &image_shape) {
+                              std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = shape[2] * shape[3];
-  image_shape[1] = shape[0] * RoundUpDiv4(shape[1]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[2] * shape[3];
+  (*image_shape)[1] = shape[0] * RoundUpDiv4(shape[1]);
 }

 // [RoundUp<4>(W) * C, N * H]
 void CalInOutWidthImageShape(const std::vector<index_t> &shape, /* NHWC */
-                             std::vector<size_t> &image_shape) {
+                             std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 4);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[2]) * shape[3];
-  image_shape[1] = shape[0] * shape[1];
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[2]) * shape[3];
+  (*image_shape)[1] = shape[0] * shape[1];
 }

 // [W, (H + 3) / 4]
 void CalWeightHeightImageShape(const std::vector<index_t> &shape, /* HW */
-                               std::vector<size_t> &image_shape) {
+                               std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 2);
-  image_shape.resize(2);
-  image_shape[0] = shape[1];
-  image_shape[1] = RoundUpDiv4(shape[0]);
+  image_shape->resize(2);
+  (*image_shape)[0] = shape[1];
+  (*image_shape)[1] = RoundUpDiv4(shape[0]);
 }

 // [(W + 3) / 4, H]
 void CalWeightWidthImageShape(const std::vector<index_t> &shape, /* HW */
-                              std::vector<size_t> &image_shape) {
+                              std::vector<size_t> *image_shape) {
  MACE_CHECK(shape.size() == 2);
-  image_shape.resize(2);
-  image_shape[0] = RoundUpDiv4(shape[1]);
-  image_shape[1] = shape[0];
+  image_shape->resize(2);
+  (*image_shape)[0] = RoundUpDiv4(shape[1]);
+  (*image_shape)[1] = shape[0];
 }

 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                     const BufferType type,
-                     std::vector<size_t> &image_shape) {
+                     std::vector<size_t> *image_shape) {
+  MACE_CHECK_NOTNULL(image_shape);
  switch (type) {
    case CONV2D_FILTER:
      CalConv2dFilterImageShape(shape, image_shape);
@@ -188,7 +194,7 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
  }
 }

-void TuningOrRun3DKernel(cl::Kernel &kernel,
+void TuningOrRun3DKernel(const cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         const std::vector<uint32_t> &lws,
@@ -202,7 +208,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
    local_ws[2] =
        std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
    return {
-        // TODO tuning these magic numbers
+        // TODO(heliangliang): tuning these magic numbers
        {local_ws[0], local_ws[1], local_ws[2], 1},
        {kwg_size / 16, 4, 4, 1},
        {kwg_size / 32, 4, 8, 1},
@@ -291,7 +297,7 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
  }
 }

-void TuningOrRun2DKernel(cl::Kernel &kernel,
+void TuningOrRun2DKernel(const cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         const std::vector<uint32_t> &lws,

--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -5,6 +5,9 @@
 #ifndef MACE_KERNELS_OPENCL_HELPER_H_
 #define MACE_KERNELS_OPENCL_HELPER_H_

+#include <string>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
@@ -30,7 +33,7 @@ enum BufferType {

 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                     const BufferType type,
-                     std::vector<size_t> &image_shape);
+                     std::vector<size_t> *image_shape);

 std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
                                      const BufferType type);
@@ -43,13 +46,13 @@ std::string DtToCLDt(const DataType dt);

 std::string DtToUpstreamCLDt(const DataType dt);

-void TuningOrRun3DKernel(cl::Kernel &kernel,
+void TuningOrRun3DKernel(const cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future);

-void TuningOrRun2DKernel(cl::Kernel &kernel,
+void TuningOrRun2DKernel(const cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         const std::vector<uint32_t> &lws,
@@ -78,7 +81,6 @@ bool IsVecEqual(const std::vector<T> &input0,
      (std::equal(input0.begin(), input0.end(), input1.begin())));
 }

-namespace {
 template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
  (*ss) << v;
@@ -92,7 +94,6 @@ void AppendToStream(std::stringstream *ss,
  (*ss) << first << delimiter;
  AppendToStream(ss, delimiter, args...);
 }
-}  // namespace

 template <typename... Args>
 std::string Concat(Args... args) {

--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -17,7 +17,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
                                                      StatsFuture *future) {
  std::vector<index_t> c_shape = {A->dim(0), A->dim(1), B->dim(2), 1};
  std::vector<size_t> c_image_shape;
-  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, c_image_shape);
+  CalImage2DShape(c_shape, BufferType::IN_OUT_HEIGHT, &c_image_shape);
  C->ResizeImage(c_shape, c_image_shape);

  const index_t batch = C->dim(0);
@@ -56,7 +56,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
     << C->dim(2) << "_" << C->dim(3);
  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
-};
+}

 template struct MatMulFunctor<DeviceType::OPENCL, float>;


--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -36,12 +36,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      built_options.emplace("-DPOOL_AVG");
    }
    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
-
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    std::vector<index_t> output_shape(4);
-    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1], input->dim(3),
-                                         input->dim(3)};
+    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
+                                         input->dim(3), input->dim(3)};

    std::vector<int> paddings(2);
    if (paddings_.empty()) {
@@ -50,12 +49,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
          padding_type_, output_shape.data(), paddings.data());
    } else {
      paddings = paddings_;
-      CalcOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(),
-                     dilations_, strides_, RoundType::CEIL, output_shape.data());
+      CalcOutputSize(input->shape().data(), filter_shape.data(),
+                     paddings_.data(), dilations_, strides_, RoundType::CEIL,
+                     output_shape.data());
    }

    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                    &output_image_shape);
    output->ResizeImage(output_shape, output_image_shape);

    uint32_t idx = 0;

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -34,7 +34,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    kernel_ =
        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
-
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
@@ -42,7 +41,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(

    std::vector<size_t> output_image_shape;
    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
-                    output_image_shape);
+                    &output_image_shape);
    output->ResizeImage(output_shape, output_image_shape);

    float height_scale =
@@ -60,7 +59,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));

    input_shape_ = input->shape();
-
  }

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),

--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -24,7 +24,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
                                     input->dim(2), output_channels});

  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
  for (size_t i= 0; i < outputs_count; ++i) {
    output_list[i]->ResizeImage(output_shape, image_shape);
  }

--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -33,7 +33,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
-
  }
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;

--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -22,7 +22,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    StatsFuture *future) {
  const char *kernel_name = nullptr;
  std::vector<size_t> output_image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
+                  &output_image_shape);
  if (b2s_) {
    space_tensor->ResizeImage(output_shape, output_image_shape);
    kernel_name = "batch_to_space";
@@ -42,7 +43,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToCLCMDDt(DataTypeToEnum<T>::value));
    kernel_ =
        runtime->BuildKernel("space_to_batch", kernel_name, built_options);
-
  }
  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
    uint32_t idx = 0;

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -27,7 +27,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    auto runtime = OpenCLRuntime::Global();
    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                   built_options);
-
  }
  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
@@ -49,7 +48,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
    output_shape = {16, input_tensor->dim(3), out_width, 1};
    std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, &image_shape);
    output_tensor->ResizeImage(output_shape, image_shape);

    uint32_t idx = 0;
@@ -83,7 +82,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *bias,
    Tensor *output_tensor,
    StatsFuture *future) {
-
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
@@ -125,7 +123,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    std::vector<index_t> output_shape = {batch_, height_, width_,
                                         input_tensor->dim(1)};
    std::vector<size_t> image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
    output_tensor->ResizeImage(output_shape, image_shape);

    const uint32_t round_h = (height_ + 1) / 2;

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -2,10 +2,13 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_KERNELS_POOLING_H
-#define MACE_KERNELS_POOLING_H
+#ifndef MACE_KERNELS_POOLING_H_
+#define MACE_KERNELS_POOLING_H_

+#include <algorithm>
 #include <limits>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -188,4 +191,4 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
 }  // namespace kernels
 }  // namespace mace

-#endif  // MACE_KERNELS_POOLING_H
+#endif  // MACE_KERNELS_POOLING_H_
--- a/mace/kernels/reshape.h
+++ b/mace/kernels/reshape.h
@@ -4,6 +4,8 @@
 #ifndef MACE_KERNELS_RESHAPE_H_
 #define MACE_KERNELS_RESHAPE_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -20,7 +22,7 @@ struct ReshapeFunctor {
                  Tensor *output,
                  StatsFuture *future) {
    output->Resize(out_shape);
-    // TODO copy on write to avoid this copy.
+    // TODO(liuqi): copy on write to avoid this copy.
    output->CopyBytes(input->raw_data(), input->size() * sizeof(T));
  }
 };

--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -4,6 +4,9 @@
 #ifndef MACE_KERNELS_RESIZE_BILINEAR_H_
 #define MACE_KERNELS_RESIZE_BILINEAR_H_

+#include <algorithm>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -11,7 +14,6 @@
 namespace mace {
 namespace kernels {

-namespace {
 struct CachedInterpolation {
  index_t lower;  // Lower source index used in the interpolation
  index_t upper;  // Upper source index used in the interpolation
@@ -101,7 +103,6 @@ void ResizeImage(const T *images,
    }
  }
 }
-}

 struct ResizeBilinearFunctorBase {
  ResizeBilinearFunctorBase(const std::vector<index_t> &size,

--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_SLICE_H_
 #define MACE_KERNELS_SLICE_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -16,7 +18,6 @@ namespace kernels {

 template<DeviceType D, typename T>
 struct SliceFunctor {
-
  void operator()(const Tensor *input,
                  const std::vector<Tensor *> &output_list,
                  StatsFuture *future) {
@@ -56,15 +57,13 @@ struct SliceFunctor {

 template<typename T>
 struct SliceFunctor<DeviceType::OPENCL, T> {
-
  void operator()(const Tensor *input,
                  const std::vector<Tensor *> &output_list,
                  StatsFuture *future);
  cl::Kernel kernel_;
-
 };

-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace

 #endif  // MACE_KERNELS_SLICE_H_
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -5,6 +5,10 @@
 #ifndef MACE_KERNELS_SOFTMAX_H_
 #define MACE_KERNELS_SOFTMAX_H_

+#include <algorithm>
+#include <functional>
+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
@@ -38,7 +42,7 @@ struct SoftmaxFunctor {
        for (index_t c = 1; c < num_classes; ++c) {
          max_value = std::max(max_value, logits_ptr[pos + c]);
        }
-        // TODO: check overflow?
+        // TODO(liuqi): check overflow?
        T sum = 0;
        for (index_t c = 0; c < num_classes; ++c) {
          exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
@@ -60,7 +64,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
  std::vector<index_t> input_shape_;
 };

-}  // namepsace kernels
+}  // namespace kernels
 }  // namespace mace

 #endif  // MACE_KERNELS_SOFTMAX_H_
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -2,8 +2,10 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_KERNELS_CONV_2D_H_
-#define MACE_KERNELS_CONV_2D_H_
+#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
+#define MACE_KERNELS_SPACE_TO_BATCH_H_
+
+#include <vector>

 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
@@ -60,4 +62,4 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
 }  // namespace kernels
 }  // namespace mace

-#endif  // MACE_KERNELS_CONV_2D_H_
+#endif  // MACE_KERNELS_SPACE_TO_BATCH_H_
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -5,6 +5,8 @@
 #ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
 #define MACE_KERNELS_WINOGRAD_TRANSFORM_H_

+#include <vector>
+
 #include "mace/core/future.h"
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"