diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index d68d4f2baf502bd51ae8ab8cc059f55cd7f57cb5..745f174466fefee3683746213247d1e549306309 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -8,6 +8,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/types.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -128,6 +129,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
   ActivationType activation_;
   T relux_max_limit_;
   T prelu_alpha_;
+  cl::Kernel kernel_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index 391ce05aace61c99573e9e1ac221c00b4efd7ed3..0b4828a4491e18be8a1c35d4d82d70d1790d0abf 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -7,6 +7,7 @@
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -42,6 +43,8 @@ template <typename T>
 struct AddNFunctor<DeviceType::OPENCL, T> {
   void operator()(const std::vector<const Tensor *> &input_tensors,
                   Tensor *output_tensor, StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  //  namespace kernels
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index e7499274c0b6bd2612e1b3ae65400d031d896c52..bd8fc7e98b91d3a9668b76b430943570df20be12 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -9,6 +9,7 @@
 #include "mace/core/public/mace.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -133,6 +134,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                   const float epsilon,
                   Tensor *output,
                   StatsFuture *future);
+  cl::Kernel kernel_;
 };
 
 }  //  namepsace kernels
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index 53b3a8d901dffa982e0854ddc8e1777be1037f9f..5b87026debf2f760e3d82c09251b99ca1426aa3b 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -8,6 +8,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/public/mace.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -63,6 +64,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future);
+  cl::Kernel kernel_;
 };
 
 }  //  namepsace kernels
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 4844f6dffcca6084c3146ba37bd23d65e471c007..424a4e917da65793b2a9c3e2f302f7690e66e952 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -10,6 +10,7 @@
 #include "mace/core/types.h"
 #include "mace/core/public/mace.h"
 #include "mace/core/tensor.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -81,6 +82,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
 
   void operator()(const std::vector<const Tensor *> &input_list,
                   Tensor *output, StatsFuture *future);
+  cl::Kernel kernel_;
 
 };
 
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index 628acceb10a21cd67bd62036876b67a1767c29c8..01c55434c85f1c5303cd555c3703861a7bf84e65 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -9,6 +9,7 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -176,6 +177,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index 92bc4e33dacc0fa339ff618053deb10ac401a534..caff18938e3ff4a4fefd65964cb36fce7fe457d7 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -9,6 +9,7 @@
 #include "mace/core/future.h"
 #include "mace/core/public/mace.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -57,7 +58,6 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
                   StatsFuture *future) {
     MACE_CHECK_NOTNULL(input);
     MACE_CHECK_NOTNULL(filter);
-    MACE_CHECK_NOTNULL(bias);
     MACE_CHECK_NOTNULL(output);
 
     // Create a fake conv_2d filter to calculate the paddings and output size
@@ -113,7 +113,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
     Tensor::MappingGuard output_mapper(output);
     const T *input_ptr = input->data<T>();
     const T *filter_ptr = filter->data<T>();
-    const T *bias_ptr = bias->data<T>();
+    const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
 #pragma omp parallel for collapse(2)
@@ -153,6 +153,10 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
         }
       }
     }
+
+    output_ptr = output->mutable_data<T>();
+    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
+                 relux_max_limit_, prelu_alpha_);
   }
 };
 
@@ -178,13 +182,15 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                                    dilations,
                                    activation,
                                    relux_max_limit,
-                                   prelu_alpha) {}
+                                   prelu_alpha){}
 
   void operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index d88e3843888d786291e34ecbfaafdff5cbe5d788..4b486c7280d92f2684d56bd6aed5a6b9477aea61 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -7,6 +7,7 @@
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -58,6 +59,8 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
                   const Tensor *B,
                   Tensor *C,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  //  namespace kernels
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 17688e9cc3e0083bfae3a5fba1b3e384f2543be1..935b3576020415a92562cefd87ea8456c440e234 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -22,52 +22,60 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const index_t channels = input->dim(3);
 
   const index_t channel_blocks = RoundUpDiv4(channels);
+  std::string tuning_key_prefix;
 
-  auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
 
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
-  built_options.emplace("-Dactivation=" + kernel_name);
-  auto dt = DataTypeToEnum<T>::value;
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  switch (activation_) {
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation_;
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
+    built_options.emplace("-Dactivation=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    switch (activation_) {
+      case RELU:
+        tuning_key_prefix = "relu_opencl_kernel_";
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        tuning_key_prefix = "relux_opencl_kernel_";
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        tuning_key_prefix = "prelu_opencl_kernel_";
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        tuning_key_prefix = "tanh_opencl_kernel_";
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        tuning_key_prefix = "sigmoid_opencl_kernel_";
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation_;
+    }
+    kernel_ =
+        runtime->BuildKernel("activation", kernel_name, built_options);
+    int idx = 0;
+    kernel_.setArg(
+        idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
+    kernel_.setArg(idx++, static_cast<float>(prelu_alpha_));
+    kernel_.setArg(idx++,
+                             *(static_cast<cl::Image2D *>(output->buffer())));
   }
-  cl::Kernel activation_kernel =
-      runtime->BuildKernel("activation", kernel_name, built_options);
-  int idx = 0;
-  activation_kernel.setArg(
-      idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
-  activation_kernel.setArg(idx++, relux_max_limit_);
-  activation_kernel.setArg(idx++, prelu_alpha_);
-  activation_kernel.setArg(idx++,
-                           *(static_cast<cl::Image2D *>(output->buffer())));
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::string tuning_key =
-      Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1),
+      Concat(tuning_key_prefix, output->dim(0), output->dim(1),
              output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future);
+  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
 }
 
 template struct ActivationFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index c16741d62421252cc888fbda6ff1a9d497a10ec9..df096e7fbf374fde681e281782860e31ba44edde 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -11,53 +11,6 @@
 namespace mace {
 namespace kernels {
 
-template <typename T>
-static void AddN(const std::vector<const Tensor *> &input_tensors,
-                 Tensor *output, StatsFuture *future) {
-  if (input_tensors.size() > 4) {
-    MACE_NOT_IMPLEMENTED;
-  }
-
-  const index_t batch = output->dim(0);
-  const index_t height = output->dim(1);
-  const index_t width = output->dim(2);
-  const index_t channels = output->dim(3);
-
-  const index_t channel_blocks = RoundUpDiv4(channels);
-  const index_t width_pixels = channel_blocks * width;
-  const index_t batch_height_pixels = batch * height;
-
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  auto dt = DataTypeToEnum<T>::value;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
-  built_options.emplace("-Daddn=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
-  auto addn_kernel = runtime->BuildKernel("addn", kernel_name, built_options);
-
-  uint32_t idx = 0;
-  for (auto input : input_tensors) {
-    addn_kernel.setArg(idx++,
-                       *(static_cast<const cl::Image2D *>(input->buffer())));
-  }
-  addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-
-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_pixels),
-      static_cast<uint32_t>(batch_height_pixels)
-  };
-  const std::vector<uint32_t> lws = {64, 16, 1};
-  std::stringstream ss;
-  ss << "addn_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
-  TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);
-}
-
 template <typename T>
 void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     const std::vector<const Tensor *> &input_tensors,
@@ -84,7 +37,44 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
   output_tensor->ResizeImage(output_shape, output_image_shape);
 
-  AddN<T>(input_tensors, output_tensor, future);
+  const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t width_pixels = channel_blocks * width;
+  const index_t batch_height_pixels = batch * height;
+
+  if (kernel_.get() == nullptr) {
+    if (input_tensors.size() > 4) {
+      MACE_NOT_IMPLEMENTED;
+    }
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    auto dt = DataTypeToEnum<T>::value;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
+    built_options.emplace("-Daddn=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
+    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+
+    uint32_t idx = 0;
+    for (auto input : input_tensors) {
+      kernel_.setArg(idx++,
+                         *(static_cast<const cl::Image2D *>(input->buffer())));
+    }
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
+  }
+
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(width_pixels),
+      static_cast<uint32_t>(batch_height_pixels)
+  };
+  const std::vector<uint32_t> lws = {64, 16, 1};
+  std::stringstream ss;
+  ss << "addn_opencl_kernel_"
+     << output_shape[0] << "_"
+     << output_shape[1] << "_"
+     << output_shape[2] << "_"
+     << output_shape[3];
+  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 };
 
 template
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 5942dcfbfa5f7ed104286ed47107c5fbb0733f9a..3b63c7ac579b902c97361ef4c9ba8b38ab650a38 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -30,55 +30,57 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  auto dt = DataTypeToEnum<T>::value;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
-  built_options.emplace("-Dbatch_norm=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  if (folded_constant_) {
-    built_options.emplace("-DFOLDED_CONSTANT");
-  }
-  switch (activation_) {
-    case NOOP:
-      break;
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation_;
-  }
+  if (kernel_.get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    auto dt = DataTypeToEnum<T>::value;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
+    built_options.emplace("-Dbatch_norm=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (folded_constant_) {
+      built_options.emplace("-DFOLDED_CONSTANT");
+    }
+    switch (activation_) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation_;
+    }
 
-  auto bm_kernel =
-      runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    kernel_ =
+        runtime->BuildKernel("batch_norm", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
-  bm_kernel.setArg(idx++,
-                   *(static_cast<const cl::Image2D *>(offset->buffer())));
-  if (!folded_constant_) {
-    bm_kernel.setArg(idx++,
-                     *(static_cast<const cl::Image2D *>(mean->buffer())));
-    bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(var->buffer())));
-    bm_kernel.setArg(idx++, epsilon);
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
+    kernel_.setArg(idx++,
+                     *(static_cast<const cl::Image2D *>(offset->buffer())));
+    if (!folded_constant_) {
+      kernel_.setArg(idx++,
+                       *(static_cast<const cl::Image2D *>(mean->buffer())));
+      kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(var->buffer())));
+      kernel_.setArg(idx++, epsilon);
+    }
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, relux_max_limit_);
+    kernel_.setArg(idx++, prelu_alpha_);
   }
-  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-  bm_kernel.setArg(idx++, relux_max_limit_);
-  bm_kernel.setArg(idx++, prelu_alpha_);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
@@ -87,7 +89,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future);
+  TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
 }
 
 template struct BatchNormFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index b85bb860534ef362b4f61332308fb48d3f915bfc..84eff1bfabcaad80d913fdd1aa0a73279883e4ad 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -24,30 +24,30 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
+  auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
+    std::set<std::string> built_options;
+    auto dt = DataTypeToEnum<T>::value;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
+    built_options.emplace("-Dbias_add=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+  }
+
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  auto dt = DataTypeToEnum<T>::value;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
-  built_options.emplace("-Dbias_add=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  auto bias_kernel = runtime->BuildKernel("bias_add", kernel_name, built_options);
-
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bias_kernel);
   const std::vector<uint32_t> lws = {8, 16, 8};
 
-  uint32_t idx = 0;
-  bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
-  bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
-  bias_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-
   cl::Event event;
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      bias_kernel, cl::NullRange,
+      kernel_, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
       cl::NDRange(lws[0], lws[1], lws[2]),
       nullptr, &event);
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index 02ebff0aaeeb6a416f79b14203896ea40b512fe7..fe8619e20b7f567bc36dfa4bc5d6c53bd5f792fb 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -1,8 +1,8 @@
 #include <common.h>
 
 __kernel void activation(__read_only image2d_t input,
-                         __private const DATA_TYPE relux_max_limit,
-                         __private const DATA_TYPE prelu_alpha,
+                         __private const float relux_max_limit,
+                         __private const float prelu_alpha,
                          __write_only image2d_t output) {
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index 280ba54c8fda24181df63d832d438c0c02efb502..d9c94007d8dbb0f212c2e8aac8f346a0d4c776a7 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -2,24 +2,24 @@
 
 // Only multiplier = 1 is supported
 __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
+                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
-                                __read_only image2d_t bias, /* cout%4 * cout/4 */
+    __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
-                                __write_only image2d_t output,
-                                __private const DATA_TYPE relux_max_limit,
-                                __private const DATA_TYPE prelu_alpha,
-                                __private const short in_height,
-                                __private const short in_width,
-                                __private const short in_ch_blks,
-                                __private const short out_height,
-                                __private const short out_width,
-                                __private const short filter_height,
-                                __private const short filter_width,
-                                __private const short padding_top,
-                                __private const short padding_left,
-                                __private const short dilation_h,
-                                __private const short dilation_w) {
+                               __write_only image2d_t output,
+                               __private const DATA_TYPE relux_max_limit,
+                               __private const DATA_TYPE prelu_alpha,
+                               __private const short in_height,
+                               __private const short in_width,
+                               __private const short in_ch_blks,
+                               __private const short out_height,
+                               __private const short out_width,
+                               __private const short filter_height,
+                               __private const short filter_width,
+                               __private const short padding_top,
+                               __private const short padding_left,
+                               __private const short dilation_h,
+                               __private const short dilation_w) {
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1);
   const short out_w_blks = get_global_size(1);
@@ -52,7 +52,6 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
   int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
   int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
   int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
-  int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
   const int height_idx = (out_h << 1) - padding_top;
 #else
   const short in_width_stride = mul24(out_w_blks, STRIDE);
@@ -90,7 +89,7 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
       READ_INPUT(3);
 
 #undef READ_INPUT
-      
+
       DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER,
                                        (int2)(filter_idx, in_ch_blk));
 
@@ -127,3 +126,120 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
   if (w >= out_width) return;
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }
+
+__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+                                  __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
+#ifdef BIAS
+    __read_only image2d_t bias, /* cout%4 * cout/4 */
+#endif
+                                  __write_only image2d_t output,
+                                  __private const DATA_TYPE relux_max_limit,
+                                  __private const DATA_TYPE prelu_alpha,
+                                  __private const short in_height,
+                                  __private const short in_width,
+                                  __private const short in_ch_blks,
+                                  __private const short out_height,
+                                  __private const short out_width,
+                                  __private const short filter_height,
+                                  __private const short filter_width,
+                                  __private const short padding_top,
+                                  __private const short padding_left) {
+  const short out_ch_blk = get_global_id(0);
+  const short out_w_blk = get_global_id(1) << 2;
+  const short out_hb = get_global_id(2);
+  const short rounded_in_ch = in_ch_blks << 2;
+  const short in_ch_blk = out_ch_blk; // multiplier = 1
+
+#ifdef BIAS
+  DATA_TYPE4 out0 =
+     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
+#endif
+
+  const short out_h = out_hb % out_height;
+  const short in_width0 = out_w_blk - padding_left;
+  const short in_width1 = in_width0 + 1;
+  const short in_width2 = in_width1 + 1;
+  const short in_width3 = in_width2 + 1;
+  const short height_idx = out_h - padding_top;
+
+  const short batch_idx = mul24((out_hb / out_height), in_height);
+  const short rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
+
+  const short in_idx = mul24(in_ch_blk, in_width);
+  short filter_idx = 0;
+  short in_hb_idx = height_idx;
+
+  const short in_w_idx0 = select(in_idx + in_width0,
+                                 -1,
+                                 (in_width0 < 0 || in_width0 >= in_width));
+  const short in_w_idx1 = select(in_idx + in_width1,
+                                 -1,
+                                 (in_width1 < 0 || in_width1 >= in_width));
+  const short in_w_idx2 = select(in_idx + in_width2,
+                                 -1,
+                                 (in_width2 < 0 || in_width2 >= in_width));
+
+  short in_w;
+  DATA_TYPE4 in0, in1, in2, in3;
+  for (short filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
+    short in_hb = select(in_hb_idx + batch_idx,
+                         -1,
+                         (in_hb_idx < 0 || in_hb_idx >= in_height));
+    in1 = READ_IMAGET(input, SAMPLER, (int2)(in_w_idx0, in_hb));
+    in2 = READ_IMAGET(input, SAMPLER, (int2)(in_w_idx1, in_hb));
+    in3 = READ_IMAGET(input, SAMPLER, (int2)(in_w_idx2, in_hb));
+
+    for (short filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
+      in0 = in1;
+      in1 = in2;
+      in2 = in3;
+
+      in_w = in_width3 + filter_w_idx;
+      in_w = select(in_idx + in_w,
+                    -1,
+                    (in_w < 0 || in_w >= in_width));
+      in3 = READ_IMAGET(input, SAMPLER, (int2)(in_w, in_hb));
+
+      DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER,
+                                       (int2)(filter_idx, in_ch_blk));
+
+      out0 = mad(in0, weights, out0);
+      out1 = mad(in1, weights, out1);
+      out2 = mad(in2, weights, out2);
+      out3 = mad(in3, weights, out3);
+      ++filter_idx;
+    }
+    in_hb_idx += 1;
+  }
+
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
+  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
+  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
+  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
+#endif
+
+  const short out_x_base = mul24(out_ch_blk, out_width);
+  short w = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
+
+  w += 1;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
+
+  w += 1;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
+
+  w += 1;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
+}
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 48f14bea8f313e19d483fd3c1b6e57f483382413..b1712d149817244fa80df61f4ae96df8971c707f 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -11,7 +11,8 @@
 namespace mace {
 namespace kernels {
 
-static void Concat2(const Tensor *input0,
+static void Concat2(cl::Kernel *kernel,
+                    const Tensor *input0,
                     const Tensor *input1,
                     const DataType dt,
                     Tensor *output,
@@ -23,27 +24,29 @@ static void Concat2(const Tensor *input0,
 
   const int channel_blk = RoundUpDiv4(channel);
 
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
-  built_options.emplace("-Dconcat_channel=" + kernel_name);
-  if (input0->dtype() == output->dtype()) {
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-  } else {
-    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  }
-  if (input0->dim(3) % 4 == 0) {
-    built_options.emplace("-DDIVISIBLE_FOUR");
-  }
-  auto concat_kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+  if (kernel->get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
+    built_options.emplace("-Dconcat_channel=" + kernel_name);
+    if (input0->dtype() == output->dtype()) {
+      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    } else {
+      built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+      built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    }
+    if (input0->dim(3) % 4 == 0) {
+      built_options.emplace("-DDIVISIBLE_FOUR");
+    }
+    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  concat_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input0->buffer())));
-  concat_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input1->buffer())));
-  concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
-  concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    uint32_t idx = 0;
+    kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input0->buffer())));
+    kernel->setArg(idx++, *(static_cast<const cl::Image2D *>(input1->buffer())));
+    kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
+    kernel->setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+  }
 
   const uint32_t gws[3] = {
       static_cast<uint32_t>(channel_blk),
@@ -57,7 +60,7 @@ static void Concat2(const Tensor *input0,
      << output->dim(1) << "_"
      << output->dim(2) << "_"
      << output->dim(3);
-  TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
 }
 
 template<typename T>
@@ -90,7 +93,7 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Te
 
   switch (inputs_count) {
     case 2:
-      Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value,
+      Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
               output, future);
       break;
     default:MACE_NOT_IMPLEMENTED;
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 03883d3a6fac483298fcfbc15d39564ad5ae8c06..5a6d93cbf29e3417c93b02d40d00e3925142c249 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -3,64 +3,44 @@
 //
 
 #include "mace/kernels/conv_2d.h"
-#include "mace/kernels/activation.h"
 #include "mace/kernels/opencl/helper.h"
 
 namespace mace {
 namespace kernels {
 
-extern void Conv2dOpenclK1x1S1(const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const float prelu_alpha,
-                               const DataType dt,
-                               Tensor *output,
-                               StatsFuture *future);
+extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
+                             const Tensor *input,
+                             const Tensor *filter,
+                             const Tensor *bias,
+                             const int stride,
+                             const int *padding,
+                             const int *dilations,
+                             const ActivationType activation,
+                             const float relux_max_limit,
+                             const float prelu_alpha,
+                             const DataType dt,
+                             Tensor *output,
+                             StatsFuture *future);
 
-extern void Conv2dOpenclK1x1S2(const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const float prelu_alpha,
-                               const DataType dt,
-                               Tensor *output,
-                               StatsFuture *future);
+extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
+                             const Tensor *input,
+                             const Tensor *filter,
+                             const Tensor *bias,
+                             const int stride,
+                             const int *padding,
+                             const int *dilations,
+                             const ActivationType activation,
+                             const float relux_max_limit,
+                             const float prelu_alpha,
+                             const DataType dt,
+                             Tensor *output,
+                             StatsFuture *future);
 
-extern void Conv2dOpenclK3x3S1(const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const float prelu_alpha,
-                               const DataType dt,
-                               Tensor *output,
-                               StatsFuture *future);
-
-extern void Conv2dOpenclK3x3S2(const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const float prelu_alpha,
-                               const DataType dt,
-                               Tensor *output,
-                               StatsFuture *future);
-
-extern void Conv2dOpencl(const Tensor *input,
+extern void Conv2dOpencl(cl::Kernel *kernel,
+                         const Tensor *input,
                          const Tensor *filter,
                          const Tensor *bias,
-                         const uint32_t stride,
+                         const int stride,
                          const int *padding,
                          const int *dilations,
                          const ActivationType activation,
@@ -70,24 +50,21 @@ extern void Conv2dOpencl(const Tensor *input,
                          Tensor *output,
                          StatsFuture *future);
 
-template <typename T>
+template<typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                       const Tensor *filter,
                                                       const Tensor *bias,
                                                       Tensor *output,
                                                       StatsFuture *future) {
   typedef void (*Conv2dOpenclFunction)(
-      const Tensor *input, const Tensor *filter, const Tensor *bias,
+      cl::Kernel *kernel,
+      const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride,
       const int *padding, const int *dilations, const ActivationType activation,
       const float relux_max_limit, const float prelu_alpha, const DataType dt,
       Tensor *output, StatsFuture *future);
   // Selection matrix: kernel_size x stride_size
-  static const Conv2dOpenclFunction selector[5][2] = {
-      {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
-      {nullptr, nullptr},
-      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
-      {nullptr, nullptr},
-      {nullptr, nullptr}};
+  static const Conv2dOpenclFunction selector[5] =
+      {Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
 
   index_t kernel_h = filter->dim(0);
   index_t kernel_w = filter->dim(1);
@@ -113,20 +90,23 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   output->ResizeImage(output_shape, output_image_shape);
 
   if (kernel_h == kernel_w && kernel_h <= 5 &&
-      selector[kernel_h - 1][strides_[0] - 1] != nullptr) {
-    auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-    conv2d_func(input, filter, bias, paddings.data(), dilations_, activation_,
+      selector[kernel_h - 1] != nullptr &&
+      0 < strides_[0] && strides_[0] < 3 ) {
+    auto conv2d_func = selector[kernel_h - 1];
+    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_,
                 relux_max_limit_, prelu_alpha_, DataTypeToEnum<T>::value,
                 output, future);
   } else {
-    Conv2dOpencl(input, filter, bias, strides_[0], paddings.data(), dilations_,
+    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
                  activation_, relux_max_limit_, prelu_alpha_,
                  DataTypeToEnum<T>::value, output, future);
   }
 }
 
-template struct Conv2dFunctor<DeviceType::OPENCL, float>;
-template struct Conv2dFunctor<DeviceType::OPENCL, half>;
+template
+struct Conv2dFunctor<DeviceType::OPENCL, float>;
+template
+struct Conv2dFunctor<DeviceType::OPENCL, half>;
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index d96f80ceea72699b4e762ae6cacb48dc5c3e4eb8..013df8509081c708849f0f2853141b2d0e8e2256 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -3,26 +3,26 @@
 //
 
 #include "mace/kernels/conv_2d.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/activation.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/tuner.h"
-#include "mace/utils/utils.h"
 
 namespace mace {
 namespace kernels {
 
-void Conv1x1(const Tensor *input,
-             const Tensor *filter,
-             const Tensor *bias,
-             const int stride,
-             const ActivationType activation,
-             const float relux_max_limit,
-             const float prelu_alpha,
-             const DataType dt,
-             Tensor *output,
-             StatsFuture *future) {
+extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
+                             const Tensor *input,
+                             const Tensor *filter,
+                             const Tensor *bias,
+                             const int stride,
+                             const int *padding,
+                             const int *dilations,
+                             const ActivationType activation,
+                             const float relux_max_limit,
+                             const float prelu_alpha,
+                             const DataType dt,
+                             Tensor *output,
+                             StatsFuture *future) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -36,62 +36,64 @@ void Conv1x1(const Tensor *input,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
-  MACE_CHECK(input_batch == batch);
+  if (kernel->get() == nullptr) {
+    MACE_CHECK(input_batch == batch);
 
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
-  built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  built_options.emplace("-DSTRIDE=" + ToString(stride));
-  if (bias != nullptr) {
-    built_options.emplace("-DBIAS");
-  }
-  switch (activation) {
-    case NOOP:
-      break;
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation;
-  }
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_1x1");
+    built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    if (bias != nullptr) {
+      built_options.emplace("-DBIAS");
+    }
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
 
-  auto runtime = OpenCLRuntime::Global();
-  auto conv_2d_kernel =
-      runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    auto runtime = OpenCLRuntime::Global();
+    *kernel =
+        runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(input->buffer())));
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(filter->buffer())));
-  if (bias != nullptr) {
-    conv_2d_kernel.setArg(idx++,
-                          *(static_cast<const cl::Image2D *>(bias->buffer())));
+    uint32_t idx = 0;
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(filter->buffer())));
+    if (bias != nullptr) {
+      kernel->setArg(idx++,
+                            *(static_cast<const cl::Image2D *>(bias->buffer())));
+    }
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(output->buffer())));
+    // FIXME handle flexable data type: half not supported
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, prelu_alpha);
+    kernel->setArg(idx++, static_cast<int>(input_height));
+    kernel->setArg(idx++, static_cast<int>(input_width));
+    kernel->setArg(idx++, static_cast<int>(input_channel_blocks));
+    kernel->setArg(idx++, static_cast<int>(height));
+    kernel->setArg(idx++, static_cast<int>(width));
   }
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(output->buffer())));
-  // FIXME handle flexable data type: half not supported
-  conv_2d_kernel.setArg(idx++, relux_max_limit);
-  conv_2d_kernel.setArg(idx++, prelu_alpha);
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
@@ -100,38 +102,9 @@ void Conv1x1(const Tensor *input,
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
+  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 }
 
-extern void Conv2dOpenclK1x1S1(const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const float prelu_alpha,
-                               const DataType dt,
-                               Tensor *output,
-                               StatsFuture *future) {
-  Conv1x1(input, filter, bias, 1, activation, relux_max_limit, prelu_alpha, dt,
-          output, future);
-};
-
-extern void Conv2dOpenclK1x1S2(const Tensor *input,
-                               const Tensor *filter,
-                               const Tensor *bias,
-                               const int *padding,
-                               const int *dilations,
-                               const ActivationType activation,
-                               const float relux_max_limit,
-                               const float prelu_alpha,
-                               const DataType dt,
-                               Tensor *output,
-                               StatsFuture *future) {
-  Conv1x1(input, filter, bias, 2, activation, relux_max_limit, prelu_alpha, dt,
-          output, future);
-};
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 853dfe0a008080f60dcbdfca6c1bb1bcc0534f27..13e367e37705d1f1a241be066bf1649d835b49f9 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -13,18 +13,19 @@
 namespace mace {
 namespace kernels {
 
-static void Conv2d3x3S12(const Tensor *input,
-                         const Tensor *filter,
-                         const Tensor *bias,
-                         const uint32_t stride,
-                         const int *padding,
-                         const int *dilations,
-                         const ActivationType activation,
-                         const float relux_max_limit,
-                         const float prelu_alpha,
-                         const DataType dt,
-                         Tensor *output,
-                         StatsFuture *future) {
+extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
+                             const Tensor *input,
+                             const Tensor *filter,
+                             const Tensor *bias,
+                             const int stride,
+                             const int *padding,
+                             const int *dilations,
+                             const ActivationType activation,
+                             const float relux_max_limit,
+                             const float prelu_alpha,
+                             const DataType dt,
+                             Tensor *output,
+                             StatsFuture *future) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -35,61 +36,63 @@ static void Conv2d3x3S12(const Tensor *input,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
 
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
-  built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  built_options.emplace("-DSTRIDE=" + ToString(stride));
-  switch (activation) {
-    case NOOP:
-      break;
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation;
-  }
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
+    built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
 
-  auto runtime = OpenCLRuntime::Global();
-  auto conv_2d_kernel =
-      runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    auto runtime = OpenCLRuntime::Global();
+    *kernel =
+        runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(input->buffer())));
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(filter->buffer())));
-  if (bias != nullptr) {
-    conv_2d_kernel.setArg(idx++,
-                          *(static_cast<const cl::Image2D *>(bias->buffer())));
+    uint32_t idx = 0;
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(filter->buffer())));
+    if (bias != nullptr) {
+      kernel->setArg(idx++,
+                            *(static_cast<const cl::Image2D *>(bias->buffer())));
+    }
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(output->buffer())));
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, prelu_alpha);
+    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
+    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
+    kernel->setArg(idx++, static_cast<int>(input_channel_blocks));
+    kernel->setArg(idx++, static_cast<int>(height));
+    kernel->setArg(idx++, static_cast<int>(width));
+    kernel->setArg(idx++, padding[0] / 2);
+    kernel->setArg(idx++, padding[1] / 2);
+    kernel->setArg(idx++, dilations[0]);
+    kernel->setArg(idx++, dilations[1]);
   }
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(output->buffer())));
-  conv_2d_kernel.setArg(idx++, relux_max_limit);
-  conv_2d_kernel.setArg(idx++, prelu_alpha);
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-  conv_2d_kernel.setArg(idx++, padding[0] / 2);
-  conv_2d_kernel.setArg(idx++, padding[1] / 2);
-  conv_2d_kernel.setArg(idx++, dilations[0]);
-  conv_2d_kernel.setArg(idx++, dilations[1]);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
@@ -98,37 +101,8 @@ static void Conv2d3x3S12(const Tensor *input,
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
+  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 }
-void Conv2dOpenclK3x3S1(const Tensor *input,
-                        const Tensor *filter,
-                        const Tensor *bias,
-                        const int *padding,
-                        const int *dilations,
-                        const ActivationType activation,
-                        const float relux_max_limit,
-                        const float prelu_alpha,
-                        const DataType dt,
-                        Tensor *output,
-                        StatsFuture *future) {
-  Conv2d3x3S12(input, filter, bias, 1, padding, dilations, activation,
-               relux_max_limit, prelu_alpha, dt, output, future);
-};
-
-void Conv2dOpenclK3x3S2(const Tensor *input,
-                        const Tensor *filter,
-                        const Tensor *bias,
-                        const int *padding,
-                        const int *dilations,
-                        const ActivationType activation,
-                        const float relux_max_limit,
-                        const float prelu_alpha,
-                        const DataType dt,
-                        Tensor *output,
-                        StatsFuture *future) {
-  Conv2d3x3S12(input, filter, bias, 2, padding, dilations, activation,
-               relux_max_limit, prelu_alpha, dt, output, future);
-};
 
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 1bff90f65ba8e08a46cf1f283e969f26a4e72a19..ef6401d35e421a0c1f2dc72d4d3a02d4e0934db6 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -13,18 +13,19 @@
 namespace mace {
 namespace kernels {
 
-void Conv2dOpencl(const Tensor *input,
-                  const Tensor *filter,
-                  const Tensor *bias,
-                  const uint32_t stride,
-                  const int *padding,
-                  const int *dilations,
-                  const ActivationType activation,
-                  const float relux_max_limit,
-                  const float prelu_alpha,
-                  const DataType dt,
-                  Tensor *output,
-                  StatsFuture *future) {
+extern void Conv2dOpencl(cl::Kernel *kernel,
+                         const Tensor *input,
+                         const Tensor *filter,
+                         const Tensor *bias,
+                         const int stride,
+                         const int *padding,
+                         const int *dilations,
+                         const ActivationType activation,
+                         const float relux_max_limit,
+                         const float prelu_alpha,
+                         const DataType dt,
+                         Tensor *output,
+                         StatsFuture *future) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -35,64 +36,66 @@ void Conv2dOpencl(const Tensor *input,
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
 
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
-  built_options.emplace("-Dconv_2d=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  built_options.emplace("-DSTRIDE=" + ToString(stride));
-  switch (activation) {
-    case NOOP:
-      break;
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation;
-  }
+  if (kernel->get() == nullptr) {
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
+    built_options.emplace("-Dconv_2d=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
 
-  auto runtime = OpenCLRuntime::Global();
-  auto conv_2d_kernel =
-      runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    auto runtime = OpenCLRuntime::Global();
+    *kernel =
+        runtime->BuildKernel("conv_2d", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(input->buffer())));
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(filter->buffer())));
-  if (bias != nullptr) {
-    conv_2d_kernel.setArg(idx++,
-                          *(static_cast<const cl::Image2D *>(bias->buffer())));
+    uint32_t idx = 0;
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(filter->buffer())));
+    if (bias != nullptr) {
+      kernel->setArg(idx++,
+                            *(static_cast<const cl::Image2D *>(bias->buffer())));
+    }
+    kernel->setArg(idx++,
+                          *(static_cast<const cl::Image2D *>(output->buffer())));
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, prelu_alpha);
+    kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+    kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+    kernel->setArg(idx++, static_cast<uint32_t>(input_channel_blocks));
+    kernel->setArg(idx++, static_cast<uint32_t>(height));
+    kernel->setArg(idx++, static_cast<uint32_t>(width));
+    kernel->setArg(idx++, static_cast<uint32_t>(filter->dim(0)));
+    kernel->setArg(idx++, static_cast<uint32_t>(filter->dim(1)));
+    kernel->setArg(idx++, static_cast<uint32_t>(stride));
+    kernel->setArg(idx++, padding[0] / 2);
+    kernel->setArg(idx++, padding[1] / 2);
+    kernel->setArg(idx++, dilations[0]);
+    kernel->setArg(idx++, dilations[1]);
   }
-  conv_2d_kernel.setArg(idx++,
-                        *(static_cast<const cl::Image2D *>(output->buffer())));
-  conv_2d_kernel.setArg(idx++, relux_max_limit);
-  conv_2d_kernel.setArg(idx++, prelu_alpha);
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input_channel_blocks));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(height));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(width));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(0)));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(1)));
-  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(stride));
-  conv_2d_kernel.setArg(idx++, padding[0] / 2);
-  conv_2d_kernel.setArg(idx++, padding[1] / 2);
-  conv_2d_kernel.setArg(idx++, dilations[0]);
-  conv_2d_kernel.setArg(idx++, dilations[1]);
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
@@ -101,7 +104,7 @@ void Conv2dOpencl(const Tensor *input,
   std::string tuning_key =
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
+  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index ef4f3b4f40740cf1e25bd55b1b57ae4508a532c4..79c9196d94021ebf32ef0d96e2e46c4e0bfdd475 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -11,7 +11,8 @@
 namespace mace {
 namespace kernels {
 
-void DepthwiseConv2d(const Tensor *input,   // NHWC
+void DepthwiseConv2d(cl::Kernel *kernel,
+                     const Tensor *input,   // NHWC
                      const Tensor *filter,  // HWIM
                      const Tensor *bias,
                      const int stride,
@@ -28,80 +29,88 @@ void DepthwiseConv2d(const Tensor *input,   // NHWC
   const index_t width = output->dim(2);
   const index_t channels = output->dim(3);
 
-  const index_t input_batch = input->dim(0);
-  const index_t input_height = input->dim(1);
-  const index_t input_width = input->dim(2);
   const index_t input_channels = input->dim(3);
-
-  const index_t filter_height = filter->dim(0);
-  const index_t filter_width = filter->dim(1);
   const index_t multiplier = filter->dim(3);
-  MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
-  MACE_CHECK(multiplier * input_channels == channels);
-  MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
-             input_channels);
 
   const index_t channel_blocks = RoundUpDiv4(channels);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
   const index_t width_blocks = RoundUpDiv4(width);
-
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
-  built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  built_options.emplace("-DSTRIDE=" + ToString(stride));
-  switch (activation) {
-    case NOOP:
-      break;
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation;
-  }
-
-  auto dw_conv2d_kernel =
-      runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
-
-  uint32_t idx = 0;
-  dw_conv2d_kernel.setArg(idx++,
-                          *(static_cast<const cl::Image2D *>(input->buffer())));
-  dw_conv2d_kernel.setArg(
-      idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
-  if (bias != nullptr) {
-    dw_conv2d_kernel.setArg(
-        idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
+  if(kernel->get() == nullptr) {
+    const index_t input_batch = input->dim(0);
+    const index_t input_height = input->dim(1);
+    const index_t input_width = input->dim(2);
+
+    const index_t filter_height = filter->dim(0);
+    const index_t filter_width = filter->dim(1);
+    MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
+    MACE_CHECK(multiplier * input_channels == channels);
+    MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
+               input_channels);
+
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
+    if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
+      kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d_s1");
+      built_options.emplace("-Ddepthwise_conv2d_s1=" + kernel_name);
+    } else {
+      built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
+    }
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    built_options.emplace("-DSTRIDE=" + ToString(stride));
+    switch (activation) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation;
+    }
+
+    *kernel = runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+
+    uint32_t idx = 0;
+    kernel->setArg(idx++,
+                            *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel->setArg(
+        idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
+    if (bias != nullptr) {
+      kernel->setArg(
+          idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
+    }
+    kernel->setArg(
+        idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
+    kernel->setArg(idx++, relux_max_limit);
+    kernel->setArg(idx++, prelu_alpha);
+    kernel->setArg(idx++, static_cast<short>(input_height));
+    kernel->setArg(idx++, static_cast<short>(input_width));
+    kernel->setArg(idx++, static_cast<short>(input_channel_blocks));
+    kernel->setArg(idx++, static_cast<short>(height));
+    kernel->setArg(idx++, static_cast<short>(width));
+    kernel->setArg(idx++, static_cast<short>(filter_height));
+    kernel->setArg(idx++, static_cast<short>(filter_width));
+    kernel->setArg(idx++, static_cast<short>(paddings[0] / 2));
+    kernel->setArg(idx++, static_cast<short>(paddings[1] / 2));
+    if (stride != 1 || dilations[0] != 1 || dilations[1] != 1) {
+      kernel->setArg(idx++, static_cast<short>(dilations[0]));
+      kernel->setArg(idx++, static_cast<short>(dilations[1]));
+    }
   }
-  dw_conv2d_kernel.setArg(
-      idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
-  dw_conv2d_kernel.setArg(idx++, relux_max_limit);
-  dw_conv2d_kernel.setArg(idx++, prelu_alpha);
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_height));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_width));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_channel_blocks));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(height));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(width));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_height));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_width));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[0] / 2));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[1] / 2));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[0]));
-  dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[1]));
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
@@ -109,7 +118,7 @@ void DepthwiseConv2d(const Tensor *input,   // NHWC
   const std::vector<uint32_t> lws = {8, 16, 8, 1};
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
                                   batch, height, width, channels, multiplier);
-  TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future);
+  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
 }
 
 template <typename T>
@@ -153,7 +162,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);
 
-  DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_,
+  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
                   activation_, relux_max_limit_, prelu_alpha_,
                   DataTypeToEnum<T>::value, output, future);
 }
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 84e102e08b3f3d78e18a07a85809b59df5b49021..451a19d068121bd55c7af4366d76a8afedfdecb6 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -121,18 +121,24 @@ std::vector<index_t> CalWinogradShape(const std::vector<index_t> &shape,
 
 std::string DtToCLDt(const DataType dt) {
   switch (dt) {
-    case DT_FLOAT:return "float";
-    case DT_HALF:return "half";
-    default:LOG(FATAL) << "Unsupported data type";
+    case DT_FLOAT:
+      return "float";
+    case DT_HALF:
+      return "half";
+    default:
+      LOG(FATAL) << "Unsupported data type";
       return "";
   }
 }
 
 std::string DtToCLCMDDt(const DataType dt) {
   switch (dt) {
-    case DT_FLOAT:return "f";
-    case DT_HALF:return "h";
-    default:LOG(FATAL) << "Not supported data type for opencl cmd data type";
+    case DT_FLOAT:
+      return "f";
+    case DT_HALF:
+      return "h";
+    default:
+      LOG(FATAL) << "Not supported data type for opencl cmd data type";
       return "";
   }
 }
@@ -140,8 +146,10 @@ std::string DtToCLCMDDt(const DataType dt) {
 std::string DtToUpstreamCLDt(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
-    case DT_HALF:return "float";
-    default:LOG(FATAL) << "Unsupported data type";
+    case DT_HALF:
+      return "float";
+    default:
+      LOG(FATAL) << "Unsupported data type";
       return "";
   }
 }
@@ -149,8 +157,10 @@ std::string DtToUpstreamCLDt(const DataType dt) {
 std::string DtToUpstreamCLCMDDt(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
-    case DT_HALF:return "f";
-    default:LOG(FATAL) << "Not supported data type for opencl cmd data type";
+    case DT_HALF:
+      return "f";
+    default:
+      LOG(FATAL) << "Not supported data type for opencl cmd data type";
       return "";
   }
 }
@@ -161,8 +171,8 @@ void TuningOrRun3DKernel(cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
     local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
@@ -258,8 +268,8 @@ void TuningOrRun2DKernel(cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
     uint32_t local_ws[2];
     local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
     local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 4406308eace8e424c2b5a2d6cf4f449d797f2ab2..44a92c611c4159248a1028eabf3a2b2903bbe76a 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -29,26 +29,28 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t height_blocks = RoundUpDiv4(height);
   const index_t width_blocks = RoundUpDiv4(width);
 
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  auto dt = DataTypeToEnum<T>::value;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
-  built_options.emplace("-Dmatmul=" + kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  auto matmul_kernel = runtime->BuildKernel("matmul", kernel_name, built_options);
+  if (kernel_.get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    auto dt = DataTypeToEnum<T>::value;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
+    built_options.emplace("-Dmatmul=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  matmul_kernel.setArg(idx++,
-                     *(static_cast<const cl::Image2D *>(A->buffer())));
-  matmul_kernel.setArg(idx++,
-                     *(static_cast<const cl::Image2D *>(B->buffer())));
-  matmul_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(C->buffer())));
-  matmul_kernel.setArg(idx++, static_cast<int>(height));
-  matmul_kernel.setArg(idx++, static_cast<int>(width));
-  matmul_kernel.setArg(idx++, static_cast<int>(A->dim(2)));
-  matmul_kernel.setArg(idx++, static_cast<int>(height_blocks));
-  matmul_kernel.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
+    uint32_t idx = 0;
+    kernel_.setArg(idx++,
+                         *(static_cast<const cl::Image2D *>(A->buffer())));
+    kernel_.setArg(idx++,
+                         *(static_cast<const cl::Image2D *>(B->buffer())));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(C->buffer())));
+    kernel_.setArg(idx++, static_cast<int>(height));
+    kernel_.setArg(idx++, static_cast<int>(width));
+    kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
+    kernel_.setArg(idx++, static_cast<int>(height_blocks));
+    kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
+  }
 
   const uint32_t gws[2] = {
       static_cast<uint32_t>(width_blocks),
@@ -61,7 +63,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(
      << C->dim(1) << "_"
      << C->dim(2) << "_"
      << C->dim(3);
-  TuningOrRun2DKernel(matmul_kernel, ss.str(), gws, lws, future);
+  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 
 };
 
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 248bf6a7a30953e6fce662f017859a97f6b44527..8a4274e8699371fa2062443ca143aaec1464aac5 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -11,68 +11,6 @@
 namespace mace {
 namespace kernels {
 
-static void Pooling(const Tensor *input,
-                    const int *stride,
-                    const int *paddings,
-                    const int pooling_size,
-                    const PoolingType type,
-                    const DataType dt,
-                    Tensor *output,
-                    StatsFuture *future) {
-  index_t batch = output->dim(0);
-  index_t out_height = output->dim(1);
-  index_t out_width = output->dim(2);
-  index_t channels = output->dim(3);
-
-  index_t channel_blocks = (channels + 3) / 4;
-
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
-  built_options.emplace("-Dpooling=" + kernel_name);
-  if (type == MAX && input->dtype() == output->dtype()) {
-    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
-  } else {
-    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  }
-  if (type == AVG) {
-    built_options.emplace("-DPOOL_AVG");
-  }
-  auto pooling_kernel = runtime->BuildKernel("pooling", kernel_name, built_options);
-
-  uint32_t idx = 0;
-  pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
-  pooling_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  pooling_kernel.setArg(idx++, paddings[0] / 2);
-  pooling_kernel.setArg(idx++, paddings[1] / 2);
-  pooling_kernel.setArg(idx++, stride[0]);
-  pooling_kernel.setArg(idx++, pooling_size);
-  pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blocks),
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(batch * out_height),
-  };
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  std::vector<uint32_t> lws(4, 1);
-  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-  std::stringstream ss;
-  ss << "pooling_opencl_kernel_"
-     << output->dim(0) << "_"
-     << output->dim(1) << "_"
-     << output->dim(2) << "_"
-     << output->dim(3);
-  TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future);
-}
-
 template<typename T>
 void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                        Tensor *output,
@@ -95,8 +33,57 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
   output->ResizeImage(output_shape, output_image_shape);
 
-  Pooling(input, strides_, paddings.data(), kernels_[0], pooling_type_,
-          DataTypeToEnum<T>::value, output, future);
+  index_t batch = output->dim(0);
+  index_t out_height = output->dim(1);
+  index_t out_width = output->dim(2);
+  index_t channels = output->dim(3);
+
+  index_t channel_blocks = (channels + 3) / 4;
+
+  if (kernel_.get() == nullptr) {
+    const DataType dt = DataTypeToEnum<T>::value;
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
+    built_options.emplace("-Dpooling=" + kernel_name);
+    if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
+      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+      built_options.emplace(dt == DT_HALF ? "-DFP16" : "");
+    } else {
+      built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+      built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    }
+    if (pooling_type_ == AVG) {
+      built_options.emplace("-DPOOL_AVG");
+    }
+    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
+    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
+    kernel_.setArg(idx++, paddings[0] / 2);
+    kernel_.setArg(idx++, paddings[1] / 2);
+    kernel_.setArg(idx++, strides_[0]);
+    kernel_.setArg(idx++, kernels_[0]);
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+  }
+
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blocks),
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(batch * out_height),
+  };
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
+  std::stringstream ss;
+  ss << "pooling_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 
 }
 
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 97550999b1a98b77abab858e7d8d21436802c4d8..f4e910459fa9e8534c14c2d68b132c325a754d16 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -21,40 +21,42 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   const index_t channels = input->dim(3);
 
   const index_t channel_blocks = RoundUpDiv4(channels);
+  const index_t out_height = out_height_;
+  const index_t out_width = out_width_;
 
-  index_t out_height = out_height_;
-  index_t out_width = out_width_;
-  MACE_CHECK(out_height > 0 && out_width > 0);
-  std::vector<index_t> output_shape {batch, out_height, out_width, channels};
-  if (input->is_image()) {
-    std::vector<size_t> output_image_shape;
-    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
-    output->ResizeImage(output_shape, output_image_shape);
-  } else {
-    output->Resize(output_shape);
-  }
+  if (kernel_.get() == nullptr) {
+    MACE_CHECK(out_height > 0 && out_width > 0);
+    std::vector<index_t> output_shape{batch, out_height, out_width, channels};
+    if (input->is_image()) {
+      std::vector<size_t> output_image_shape;
+      CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
+      output->ResizeImage(output_shape, output_image_shape);
+    } else {
+      output->Resize(output_shape);
+    }
 
-  float height_scale =
-      CalculateResizeScale(in_height, out_height, align_corners_);
-  float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
+    float height_scale =
+        CalculateResizeScale(in_height, out_height, align_corners_);
+    float width_scale = CalculateResizeScale(in_width, out_width, align_corners_);
 
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
-  built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
-  auto dt = DataTypeToEnum<T>::value;
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  auto rb_kernel  = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
+    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    kernel_ = runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
-  rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-  rb_kernel.setArg(idx++, height_scale);
-  rb_kernel.setArg(idx++, width_scale);
-  rb_kernel.setArg(idx++, static_cast<int32_t>(in_height));
-  rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
-  rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    kernel_.setArg(idx++, height_scale);
+    kernel_.setArg(idx++, width_scale);
+    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
+    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
+    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
+  }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(out_width),
@@ -66,7 +68,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
      << output->dim(1) << "_"
      << output->dim(2) << "_"
      << output->dim(3);
-  TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 25e99661afe7d3afc6dbc407a5c9b7260c986ea9..55a487757ebcc399d61813db5935259454dfd935 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -23,21 +23,23 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const int remain_channels = channel_blocks * 4 - channels;
 
-  auto runtime = OpenCLRuntime::Global();
+  if (kernel_.get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
 
-  std::set<std::string> built_options;
-  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
-  built_options.emplace("-Dsoftmax=" + kernel_name);
-  auto dt = DataTypeToEnum<T>::value;
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-  cl::Kernel softmax_kernel = runtime->BuildKernel("softmax", kernel_name, built_options);
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
+    built_options.emplace("-Dsoftmax=" + kernel_name);
+    auto dt = DataTypeToEnum<T>::value;
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  softmax_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(logits->buffer())));
-  softmax_kernel.setArg(idx++, static_cast<int>(channels));
-  softmax_kernel.setArg(idx++, remain_channels);
-  softmax_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(logits->buffer())));
+    kernel_.setArg(idx++, static_cast<int>(channels));
+    kernel_.setArg(idx++, remain_channels);
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+  }
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
@@ -48,7 +50,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
      << output->dim(1) << "_"
      << output->dim(2) << "_"
      << output->dim(3);
-  TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
 template
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 744fe0273084751a403dbe28553d93042e784c93..eccd6cef666d06f879d150e03c89169972318cdd 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -20,9 +20,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
                                                             const std::vector<index_t> &output_shape,
                                                             Tensor *batch_tensor,
                                                             StatsFuture *future) {
+  const char *kernel_name = nullptr;
   std::vector<size_t> output_image_shape;
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, output_image_shape);
-  const char *kernel_name = nullptr;
   if (b2s_) {
     space_tensor->ResizeImage(output_shape, output_image_shape);
     kernel_name = "batch_to_space";
@@ -30,32 +30,34 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
     batch_tensor->ResizeImage(output_shape, output_image_shape);
     kernel_name = "space_to_batch";
   }
-  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-  auto runtime = OpenCLRuntime::Global();
-  std::set<std::string> built_options;
-  std::stringstream kernel_name_ss;
-  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
-  built_options.emplace(kernel_name_ss.str());
-  built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
-  auto s2b_kernel = runtime->BuildKernel("space_to_batch", kernel_name, built_options);
+  if (kernel_.get() == nullptr) {
+    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::stringstream kernel_name_ss;
+    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
+    built_options.emplace(kernel_name_ss.str());
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    kernel_ = runtime->BuildKernel("space_to_batch", kernel_name, built_options);
 
-  uint32_t idx = 0;
-  if (b2s_) {
-    s2b_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(batch_tensor->buffer())));
-    s2b_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(space_tensor->buffer())));
-  } else {
-    s2b_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(space_tensor->buffer())));
-    s2b_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(batch_tensor->buffer())));
+    uint32_t idx = 0;
+    if (b2s_) {
+      kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(batch_tensor->buffer())));
+      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(space_tensor->buffer())));
+    } else {
+      kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(space_tensor->buffer())));
+      kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(batch_tensor->buffer())));
+    }
+    kernel_.setArg(idx++, block_shape_[0]);
+    kernel_.setArg(idx++, block_shape_[1]);
+    kernel_.setArg(idx++, paddings_[0]);
+    kernel_.setArg(idx++, paddings_[2]);
+    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
+    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
+    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
   }
-  s2b_kernel.setArg(idx++, block_shape_[0]);
-  s2b_kernel.setArg(idx++, block_shape_[1]);
-  s2b_kernel.setArg(idx++, paddings_[0]);
-  s2b_kernel.setArg(idx++, paddings_[2]);
-  s2b_kernel.setArg(idx++, static_cast<int32_t>(space_tensor->dim(1)));
-  s2b_kernel.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
-  s2b_kernel.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
-  s2b_kernel.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
 
   const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
   const uint32_t gws[3] = {chan_blk,
@@ -68,7 +70,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
      << batch_tensor->dim(1) << "_"
      << batch_tensor->dim(2) << "_"
      << batch_tensor->dim(3);
-  TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
 template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 273806061f3bd068725451cc88787c59bc79bce5..a842ba719174505e115fd26bc8bd8cd7f2301898 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -25,31 +25,34 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
   const index_t round_h = (output_shape[1] + 1) / 2;
   const index_t round_w = (output_shape[2] + 1) / 2;
   const index_t out_width = input_tensor->dim(0) * round_h * round_w;
-  output_shape = {16, input_tensor->dim(3), out_width, 1};
-  std::vector<size_t> image_shape;
-  CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
-  output_tensor->ResizeImage(output_shape, image_shape);
 
-  string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
-  std::set<std::string> built_options;
-  built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-  auto runtime = OpenCLRuntime::Global();
-  auto wino_kernel = runtime->BuildKernel("winograd_transform",
-                                         obfuscated_kernel_name,
-                                         built_options);
-
-  uint32_t idx = 0;
-  wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
-  wino_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_w));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
+  if (kernel_.get() == nullptr) {
+    output_shape = {16, input_tensor->dim(3), out_width, 1};
+    std::vector<size_t> image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT_HEIGHT, image_shape);
+    output_tensor->ResizeImage(output_shape, image_shape);
+
+    string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
+    std::set<std::string> built_options;
+    built_options.emplace("-Dwinograd_transform_2x2=" + obfuscated_kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    auto runtime = OpenCLRuntime::Global();
+    kernel_ = runtime->BuildKernel("winograd_transform",
+                                   obfuscated_kernel_name,
+                                   built_options);
+
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
+    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
+    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(2)));
+    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(3)));
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
+    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
+    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
+  }
 
   const uint32_t gws[2] = {static_cast<size_t>(out_width),
                          static_cast<size_t>(RoundUpDiv4(input_tensor->dim(3)))};
@@ -60,7 +63,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
      << input_tensor->dim(1) << "_"
      << input_tensor->dim(2) << "_"
      << input_tensor->dim(3);
-  TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future);
+  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
 template<typename T>
@@ -73,53 +76,55 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
   CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
   output_tensor->ResizeImage(output_shape, image_shape);
 
-  string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
-  std::set<std::string> built_options;
-  built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
-  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  switch (activation_) {
-    case NOOP:
-      break;
-    case RELU:
-      built_options.emplace("-DUSE_RELU");
-      break;
-    case RELUX:
-      built_options.emplace("-DUSE_RELUX");
-      break;
-    case PRELU:
-      built_options.emplace("-DUSE_PRELU");
-      break;
-    case TANH:
-      built_options.emplace("-DUSE_TANH");
-      break;
-    case SIGMOID:
-      built_options.emplace("-DUSE_SIGMOID");
-      break;
-    defeult:
-      LOG(FATAL) << "Unknown activation type: " << activation_;
-  }
-
-  auto runtime = OpenCLRuntime::Global();
-  auto wino_kernel = runtime->BuildKernel("winograd_transform",
-                                         obfuscated_kernel_name,
-                                         built_options);
-
-  const uint32_t round_h = (height_ + 1) / 2;
-  const uint32_t round_w = (width_ + 1) / 2;
-  uint32_t idx = 0;
-  wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
-  if (bias != nullptr) {
-    wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
+  if (kernel_.get() == nullptr) {
+    string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
+    std::set<std::string> built_options;
+    built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+    switch (activation_) {
+      case NOOP:
+        break;
+      case RELU:
+        built_options.emplace("-DUSE_RELU");
+        break;
+      case RELUX:
+        built_options.emplace("-DUSE_RELUX");
+        break;
+      case PRELU:
+        built_options.emplace("-DUSE_PRELU");
+        break;
+      case TANH:
+        built_options.emplace("-DUSE_TANH");
+        break;
+      case SIGMOID:
+        built_options.emplace("-DUSE_SIGMOID");
+        break;
+      defeult:
+        LOG(FATAL) << "Unknown activation type: " << activation_;
+    }
+
+    auto runtime = OpenCLRuntime::Global();
+    kernel_ = runtime->BuildKernel("winograd_transform",
+                                   obfuscated_kernel_name,
+                                   built_options);
+
+    const uint32_t round_h = (height_ + 1) / 2;
+    const uint32_t round_w = (width_ + 1) / 2;
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
+    if (bias != nullptr) {
+      kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
+    }
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
+    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
+    kernel_.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
+    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
+    kernel_.setArg(idx++, relux_max_limit_);
+    kernel_.setArg(idx++, prelu_alpha_);
   }
-  wino_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
-  wino_kernel.setArg(idx++, static_cast<uint32_t>(round_w));
-  wino_kernel.setArg(idx++, relux_max_limit_);
-  wino_kernel.setArg(idx++, prelu_alpha_);
 
   const uint32_t gws[2] = {static_cast<size_t>(input_tensor->dim(2)),
                          static_cast<size_t>(RoundUpDiv4(input_tensor->dim(1)))};
@@ -131,7 +136,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
      << input_tensor->dim(1) << "_"
      << input_tensor->dim(2) << "_"
      << input_tensor->dim(3);
-  TuningOrRun2DKernel(wino_kernel, ss.str(), gws, lws, future);
+  TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
 }
 
 template
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 1c1d90b2fd035eec8c644404c7f721414d1952a1..99d2363f8b003a3802c67b7177d286fd19ca1643 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -9,6 +9,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 
@@ -171,6 +172,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
   void operator()(const Tensor *input_tensor,
                   Tensor *output_tensor,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  //  namespace kernels
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 8d3d7fe3a4baa73ada8aff5e59bbc7d7c3a46c71..43e6a2df6140e5a2700bbe8dea528e72411724e0 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -6,6 +6,7 @@
 
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -169,6 +170,8 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T> : ResizeBilinearFunctorBase
       : ResizeBilinearFunctorBase(size, align_corners) {}
 
   void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index c686c60cec60098200e7da0db31472321dbe41ae..b29514a29e9ec4e1e37df3e643878ca914529b74 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -8,6 +8,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/public/mace.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -55,6 +56,8 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
   void operator()(const Tensor *logits,
                   Tensor *output,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  //  namepsace kernels
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 9e2fd8762c4a7e8fa5d0d3b405d8701a6a914ed7..31b31bf944feff5e6bf3f3685601b356de530ef1 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -8,6 +8,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/core/public/mace.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -51,6 +52,8 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T>: SpaceToBatchFunctorBase{
                   Tensor *batch_tensor,
                   StatsFuture *future);
 
+  cl::Kernel kernel_;
+
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index a71bda24b120f3eab77171dd2836c606151b6486..d6ba1e6251972a97dec91ac3da4fface16bf1bdd 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -9,6 +9,7 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
 #include "mace/kernels/activation.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 
 namespace mace {
 namespace kernels {
@@ -43,6 +44,8 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFuncto
   void operator()(const Tensor *input,
                   Tensor *output,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 struct WinogradInverseTransformFunctorBase {
@@ -100,6 +103,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseT
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future);
+
+  cl::Kernel kernel_;
 };
 
 }  // namespace kernels
diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h
index ed4ff152e444a5a9a5712e7112e01922e7fefd8b..04ad7f1b521ab0447a31e6bbcd15cc3bdab83f73 100644
--- a/mace/ops/depthwise_conv2d.h
+++ b/mace/ops/depthwise_conv2d.h
@@ -22,9 +22,11 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
         functor_(this->strides_.data(),
                  this->padding_,
                  this->dilations_.data(),
-                 kernels::ActivationType::NOOP,
-                 0.0f,
-                 0.0f) {}
+                 kernels::StringToActivationType(
+                     OperatorBase::GetSingleArgument<std::string>("activation",
+                                                                  "NOOP")),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc
index 3a963dd0408afc4069ce654b4a5002d3d8252504..1f7dfa3ca6bb3ecbf2e1a2528af165bd2ab7c8b6 100644
--- a/mace/ops/depthwise_conv_2d_benchmark.cc
+++ b/mace/ops/depthwise_conv_2d_benchmark.cc
@@ -94,16 +94,16 @@ static void DepthwiseConv2d(int iters,
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
+//BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);
 
 }  //  namespace mace