diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 3c8b013ac51bfd219afef3a91ab600e195c0b99e..5b7ccdd8a7f24aec1247fe0a60b22c1b915f37eb 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() { return (stop_nanos_ - start_nanos_) / 1000.0; } +double OpenCLProfilingTimer::AccumulatedMicros() { + return accumulated_micros_; +} + +void OpenCLProfilingTimer::AccumulateTiming(){ + StopTiming(); + accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0; +} + +void OpenCLProfilingTimer::ClearTiming() { + start_nanos_ = 0; + stop_nanos_ = 0; + accumulated_micros_ = 0; +} + OpenCLRuntime *OpenCLRuntime::Global() { static OpenCLRuntime instance; return &instance; diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 7245b926997459da7c52992524f635bc041d0c92..ff596459eaac19b69deb40c4a60440f9a1e484ac 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -18,16 +18,20 @@ namespace mace { class OpenCLProfilingTimer : public Timer { - public: - explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {}; - void StartTiming() override; - void StopTiming() override; - double ElapsedMicros() override; + public: + explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {}; + void StartTiming() override; + void StopTiming() override; + void AccumulateTiming() override; + void ClearTiming() override; + double ElapsedMicros() override; + double AccumulatedMicros() override; - private: - const cl::Event *event_; - double start_nanos_; - double stop_nanos_; + private: + const cl::Event *event_; + double start_nanos_; + double stop_nanos_; + double accumulated_micros_; }; class OpenCLRuntime { @@ -40,15 +44,15 @@ class OpenCLRuntime { void GetCallStats(const cl::Event &event, CallStats *stats); uint32_t GetDeviceMaxWorkGroupSize(); - uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel); + uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, const std::set &build_options); private: OpenCLRuntime(); ~OpenCLRuntime(); - OpenCLRuntime(const OpenCLRuntime&) = delete; - OpenCLRuntime &operator=(const OpenCLRuntime&) = delete; + OpenCLRuntime(const OpenCLRuntime &) = delete; + OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; void BuildProgram(const std::string &program_file_name, const std::string &binary_file_name, diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 44eaa47e52a9558a27f8ba70128b7c06eb457a65..5575a0b1b70868e18a859131065ad4b498b27e43 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -63,52 +63,11 @@ void ActivationFunctor::operator()(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = - runtime->GetKernelMaxWorkGroupSize(activation_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], 
local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future); } template struct ActivationFunctor; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index b4079dc39b729e589de1596651470aee84347c14..261efde071ee3b200c3a35290e685b43297ec956 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -49,56 +49,14 @@ static void AddN(const std::vector &input_tensors, static_cast(width_pixels), static_cast(batch_height_pixels) }; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel); - std::vector lws = {64, 16}; - auto params_generator = [&]() -> std::vector> { - uint32_t local_ws[2]; - local_ws[0] = std::min(width_pixels, kwg_size); - local_ws[1] = std::min(batch_height_pixels, kwg_size / local_ws[0]); - return {{local_ws[0], local_ws[1]}, - {local_ws[1], local_ws[0]}, - {kwg_size / 4, 4}, - {kwg_size / 16, 16}, - {kwg_size / 32, 32}, - {kwg_size / 64, 64}, - {kwg_size / 128, 128}, - {kwg_size / 256, 256}, - {kwg_size / 512, 512}, - {kwg_size, 1}, - {1, kwg_size} - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - addn_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1]), - cl::NDRange(params[0], params[1]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {64, 16, 1}; std::stringstream ss; ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 2d6c95a37963b6ffceb9d216d58017cedd01cb00..02ab76a85eedcdeb735c69937a326522fcf6b273 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -83,51 +83,11 @@ void BatchNormFunctor::operator()(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - const std::vector lws = 
{8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {8, 128, 1}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future); } template struct BatchNormFunctor; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index a02b155236c31a4f57177f51691ba7864aec91cc..b47a096efd2d2472e50b510e722e7142740fb332 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -50,65 +50,14 @@ static void Concat2(const Tensor *input0, static_cast(width), static_cast(batch * height), }; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel); - std::vector lws = {8, 16, 8}; - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blk, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - concat_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - 
future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index a8e9192d2c410f5e5bd7c5802b8f332f50c5b400..d62fdf56535372d7fa98da2dad16395656c078bb 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -96,51 +96,11 @@ void Conv1x1(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - const std::vector lws = {8, 15, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 15, 8, 1}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); } extern void Conv2dOpenclK1x1S1(const Tensor *input, diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 9779107b4a1bd1f524d4faa2766bbb37776b603d..3875403862fd97e58f6e6279e0d4e6a92ab9a96c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -94,52 +94,11 @@ static void Conv2d3x3S12(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - const std::vector lws = {4, 15, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 
64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {4, 15, 8, 1}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); } void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 8929579907b006ffeaf9b3ac3bb25260077880ee..2a96d8647668ab1abd5bc61c07f1af1c4b37a99e 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -96,52 +96,11 @@ void Conv2dOpencl(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); } } // namespace kernels diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a..e220d34463212887bbaaf927288a15ad9549ba32 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -4,13 +4,14 @@ #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { // [(c+3)/4*W, N * H] void 
CalInOutputImageShape(const std::vector &shape, /* NHWC */ - std::vector &image_shape) { + std::vector &image_shape) { MACE_CHECK(shape.size() == 4); image_shape.resize(2); image_shape[0] = RoundUpDiv4(shape[3]) * shape[2]; @@ -39,41 +40,30 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape) { switch (type) { - case FILTER: - CalFilterImageShape(shape, image_shape); + case FILTER:CalFilterImageShape(shape, image_shape); break; - case IN_OUT: - CalInOutputImageShape(shape, image_shape); + case IN_OUT:CalInOutputImageShape(shape, image_shape); break; - case ARGUMENT: - CalArgImageShape(shape, image_shape); + case ARGUMENT:CalArgImageShape(shape, image_shape); break; - default: - LOG(FATAL) << "Mace not supported yet."; + default:LOG(FATAL) << "Mace not supported yet."; } } - std::string DtToCLDt(const DataType dt) { switch (dt) { - case DT_FLOAT: - return "float"; - case DT_HALF: - return "half"; - default: - LOG(FATAL) << "Unsupported data type"; + case DT_FLOAT:return "float"; + case DT_HALF:return "half"; + default:LOG(FATAL) << "Unsupported data type"; return ""; } } std::string DtToCLCMDDt(const DataType dt) { switch (dt) { - case DT_FLOAT: - return "f"; - case DT_HALF: - return "h"; - default: - LOG(FATAL) << "Not supported data type for opencl cmd data type"; + case DT_FLOAT:return "f"; + case DT_HALF:return "h"; + default:LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; } } @@ -81,10 +71,8 @@ std::string DtToCLCMDDt(const DataType dt) { std::string DtToUpstreamCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: - case DT_HALF: - return "float"; - default: - LOG(FATAL) << "Unsupported data type"; + case DT_HALF:return "float"; + default:LOG(FATAL) << "Unsupported data type"; return ""; } } @@ -92,13 +80,200 @@ std::string DtToUpstreamCLDt(const DataType dt) { std::string DtToUpstreamCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: - case DT_HALF: - return "f"; - default: - LOG(FATAL) << "Not supported data type for opencl cmd data type"; + case DT_HALF:return "f"; + default:LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; } } +void TuningOrRun3DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future) { + auto runtime = OpenCLRuntime::Global(); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(gws[0], kwg_size); + local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); + local_ws[2] = std::min(gws[2], + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2], 1}, + {kwg_size / 16, 4, 4, 1}, + {kwg_size / 32, 4, 8, 1}, + {kwg_size / 32, 8, 4, 1}, + {kwg_size / 64, 8, 8, 1}, + {kwg_size / 64, 16, 4, 1}, + {kwg_size / 128, 8, 16, 1}, + {kwg_size / 128, 16, 8, 1}, + {kwg_size / 128, 32, 4, 1}, + {1, kwg_size / 32, 32, 1}, + {1, kwg_size / 64, 64, 1}, + {1, kwg_size / 128, 128, 1}, + {3, 15, 9, 1}, + {7, 15, 9, 1}, + {9, 7, 15, 1}, + {15, 7, 9, 1}, + {1, kwg_size, 1, 1}, + {4, 15, 8, 1}, // SNPE size + }; + }; + cl::Event event; + auto func = [&](const std::vector ¶ms, + Timer *timer, + std::vector *tuning_result) -> cl_int { + MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D"; + cl_int error = CL_SUCCESS; + if (timer == nullptr) { + uint32_t num_blocks = params[3]; + const uint32_t block_size = gws[2] / 
num_blocks; + if (gws[2] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + } + } else { + timer->ClearTiming(); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + tuning_result->assign(params.begin(), params.end()); + + if (LimitKernelTime()) { + double elapse_time = timer->AccumulatedMicros(); + timer->ClearTiming(); + uint32_t num_blocks = std::min(static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[2]); + (*tuning_result)[3] = num_blocks; + const uint32_t block_size = gws[2] / num_blocks; + if (gws[2] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + } + } + } + return error; + }; + OpenCLProfilingTimer timer(&event); + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + + if (future != nullptr) { + future->wait_fn = [event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + OpenCLRuntime::Global()->GetCallStats(event, stats); + } + }; + } +} + +void TuningOrRun2DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future) { + auto runtime = OpenCLRuntime::Global(); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); + auto params_generator = [&]() -> std::vector> { + uint32_t local_ws[2]; + local_ws[0] = std::min(gws[0], kwg_size); + local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); + return {{local_ws[0], local_ws[1], 1}, + {local_ws[1], local_ws[0], 1}, + {kwg_size / 4, 4, 1}, + {kwg_size / 16, 16, 1}, + {kwg_size / 32, 32, 1}, + {kwg_size / 64, 64, 1}, + {kwg_size / 128, 128, 1}, + {kwg_size / 256, 256, 1}, + {kwg_size / 512, 512, 1}, + {kwg_size, 1, 1}, + {1, kwg_size, 1} + }; + }; + cl::Event event; + auto func = [&](const std::vector ¶ms, + Timer *timer, + std::vector *tuning_result) -> cl_int { + MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d"; + cl_int error = CL_SUCCESS; + if (timer == nullptr) { + uint32_t num_blocks = params[2]; + const uint32_t block_size = gws[1] / num_blocks; + if (gws[1] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws1 = (i == num_blocks - 1) ? 
(gws[1] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, i * block_size), + cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), + nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + } + } else { + timer->ClearTiming(); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + tuning_result->assign(params.begin(), params.end()); + + if (LimitKernelTime()) { + double elapse_time = timer->AccumulatedMicros(); + timer->ClearTiming(); + uint32_t num_blocks = std::min(static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[1]); + (*tuning_result)[2] = num_blocks; + const uint32_t block_size = gws[1] / num_blocks; + if (gws[1] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, i * block_size), + cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + } + } + } + return error; + }; + OpenCLProfilingTimer timer(&event); + Tuner::Get()->template TuneOrRun(tuning_key, + lws, + params_generator, + func, + &timer); + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + +} + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 2927dbfff77000166027cd377ff05dc1337bcc00..466064b6d8b6ab98a09ec001fb46cace22447b78 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -14,9 +14,11 @@ namespace mace { namespace kernels { +const float kMaxKernelExeTime = 1000.0; // microseconds + enum BufferType { FILTER = 0, - IN_OUT= 1, + IN_OUT = 1, ARGUMENT = 2 }; @@ -32,6 +34,19 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpstreamCLDt(const DataType dt); +void TuningOrRun3DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future); + + +void TuningOrRun2DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future); + inline void SetFuture(StatsFuture *future, const cl::Event &event) { if (future != nullptr) { future->wait_fn = [event](CallStats *stats) { @@ -43,10 +58,15 @@ inline void SetFuture(StatsFuture *future, const cl::Event &event) { } } +inline bool LimitKernelTime() { + const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); + return flag != nullptr && strlen(flag) == 1 && flag[0] == '1'; +} + namespace { template void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { - (*ss) << v; + (*ss) << v; } template @@ -54,8 +74,8 @@ void AppendToStream(std::stringstream *ss, const std::string &delimiter, T first, Args... 
args) { - (*ss) << first << delimiter; - AppendToStream(ss, delimiter, args...); + (*ss) << first << delimiter; + AppendToStream(ss, delimiter, args...); } } // namespace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 79a6f102930e69cec40ce2447fa5a4dcc83bbf2b..b147c15ad1e34def84560c4fd81da2988d1b8c89 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -60,67 +60,17 @@ static void Pooling(const Tensor *input, static_cast(batch * out_height), }; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - std::vector lws(3, 0); + std::vector lws(4, 1); lws[0] = std::min(channel_blocks, kwg_size); lws[1] = std::min(out_width, kwg_size / lws[0]); lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); - local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; std::stringstream ss; ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index dc0d8cd08cbd0eeb24a6a46c23bdb37813ebbba2..f8d3aed2a3cb232aafe54d9713dd8efd7635bddb 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -59,60 +59,14 @@ void ResizeBilinearFunctor::operator()( const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(out_width), static_cast(out_height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); - local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 
16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - rb_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future); } template struct ResizeBilinearFunctor; diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index bfc75e73f8786a67ae7dc19723f3e5ff03d6f476..e47a4f8956397424475dd14026b205a0b698485c 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -41,64 +41,14 @@ void SoftmaxFunctor::operator()(const Tensor *logits, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}}; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - softmax_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 
1fd5bf1a5d1199cbf1fb4139b2c4d83b7b0d9408..8ef3f7c45e4c9bd61c0d02aa6e7d0e0dfdb75d82 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -61,58 +61,14 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor const uint32_t gws[3] = {chan_blk, static_cast(batch_tensor->dim(2)), static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(chan_blk, kwg_size); - local_ws[1] = std::min(32, kwg_size / local_ws[0]); - local_ws[2] = std::min(32, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {4, 32, 8}, - {4, 64, 4}, - {4, 128, 2}, - {8, 16, 8}, - {8, 32, 4}, - {8, 64, 2}, - {16, 8, 8}, - {16, 16, 4}, - {16, 32, 2}, - {32, 8, 4}, - {32, 16, 2}, - {64, 4, 4}}; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - s2b_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" << batch_tensor->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future); } template struct SpaceToBatchFunctor; diff --git a/mace/utils/timer.h b/mace/utils/timer.h index cee4411e278abc3dce303c15f02ae8c37acfef1a..ca0c2b3ca04a8af260b9cbfacbcca2a5a02906cb 100644 --- a/mace/utils/timer.h +++ b/mace/utils/timer.h @@ -10,29 +10,50 @@ namespace mace { class Timer { - public: - virtual void StartTiming() = 0; - virtual void StopTiming() = 0; - virtual double ElapsedMicros() = 0; + public: + virtual void StartTiming() = 0; + virtual void StopTiming() = 0; + virtual void AccumulateTiming() = 0; + virtual void ClearTiming() = 0; + virtual double ElapsedMicros() = 0; + virtual double AccumulatedMicros() = 0; }; class WallClockTimer : public Timer { - public: - void StartTiming() override { - start_micros_ = mace::utils::NowMicros(); - } - - void StopTiming() override { - stop_micros_ = mace::utils::NowMicros(); - } - - double ElapsedMicros() override { - return stop_micros_ - start_micros_; - } - - private: - double start_micros_; - double stop_micros_; + public: + WallClockTimer() : accumulated_micros_(0) {} + + void StartTiming() override { + start_micros_ = mace::utils::NowMicros(); + } + + void StopTiming() override { + stop_micros_ = mace::utils::NowMicros(); + } + + void AccumulateTiming() override { + StopTiming(); + accumulated_micros_ += stop_micros_ - start_micros_; + } + + void ClearTiming() override { + start_micros_ = 0; + stop_micros_ = 0; + accumulated_micros_ = 0; + } + + double ElapsedMicros() override { + return stop_micros_ - start_micros_; + } + + double AccumulatedMicros() override { + return accumulated_micros_; + } + + private: + double start_micros_; + double 
stop_micros_; + double accumulated_micros_; }; } // namespace mace diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index b7364e66a72b5861d8d67801c79b921029b6c04a..369152819afb67c554c8c057777fc91d9b3e1349 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -41,10 +41,10 @@ class Tuner { template RetType TuneOrRun( const std::string param_key, - const std::vector &default_param, + std::vector &default_param, const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *, std::vector *)> &func, Timer *timer) { std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key); if (IsTuning() && param_generator != nullptr) { @@ -60,12 +60,12 @@ class Tuner { if (param_table_.find(obfucated_param_key) != param_table_.end()) { VLOG(1) << param_key << ": " << internal::MakeString(param_table_[obfucated_param_key]); - return func(param_table_[obfucated_param_key]); + return func(param_table_[obfucated_param_key], nullptr, nullptr); } else { #ifndef MACE_DISABLE_NO_TUNING_WARNING LOG(WARNING) << "Fallback to default parameter: " << param_key; #endif - return func(default_param); + return func(default_param, nullptr, nullptr); } } } @@ -119,18 +119,17 @@ class Tuner { template inline RetType Run( - const std::function &)> &func, - const std::vector ¶ms, + const std::function &, Timer *, std::vector *)> &func, + std::vector ¶ms, Timer *timer, int num_runs, - double *time_us) { + double *time_us, + std::vector *tuning_result) { RetType res; int64_t total_time_us = 0; for (int i = 0; i < num_runs; ++i) { - timer->StartTiming(); - res = func(params); - timer->StopTiming(); - total_time_us += timer->ElapsedMicros(); + res = func(params, timer, tuning_result); + total_time_us += timer->AccumulatedMicros(); } *time_us = total_time_us * 1.0 / num_runs; @@ -141,24 +140,25 @@ class Tuner { inline RetType Tune( const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *, std::vector *)> &func, Timer *timer, std::vector *opt_params) { RetType res; double opt_time = std::numeric_limits::max(); auto params = param_generator(); - for (const auto ¶m : params) { + std::vector tuning_result; + for (auto param : params) { double tmp_time = 0.0; // warm up - Run(func, param, timer, 2, &tmp_time); + Run(func, param, timer, 2, &tmp_time, &tuning_result); // run - RetType tmp_res = Run(func, param, timer, 10, &tmp_time); + RetType tmp_res = Run(func, param, timer, 10, &tmp_time, &tuning_result); // Check the execution time if (tmp_time < opt_time) { opt_time = tmp_time; - *opt_params = param; + *opt_params = tuning_result; res = tmp_res; } } diff --git a/tools/export_lib.sh b/tools/export_lib.sh index abcaf6145b9b8583eecf669088d8258197117adb..cb0028376b5b25da5eb4431c9d9b9633077f3d1e 100755 --- a/tools/export_lib.sh +++ b/tools/export_lib.sh @@ -68,7 +68,6 @@ build_target() --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-Werror=return-type" \ --copt="-DMACE_OBFUSCATE_LITERALS" \ - $TUNING_MODE_BUILD_FLAGS \ $DSP_MODE_BUILD_FLAGS || exit -1 } diff --git a/tools/wino_conv.py b/tools/wino_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..a8cdf3d8e88586b10dd3256de3670978c2a2e5f2 --- /dev/null +++ b/tools/wino_conv.py @@ -0,0 +1,141 @@ +import numpy as np +import math +import tensorflow as tf + +A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32) +A = np.transpose(A_T) +B_T = np.array([ + [1, 0, -1, 0], + [0, 1, 1, 0], + [0, -1, 1, 0], + [0, 1, 0, -1] +]).astype(np.float32) +B = 
np.transpose(B_T) +G = np.array([ + [1, 0, 0], + [0.5, 0.5, 0.5], + [0.5, -0.5, 0.5], + [0, 0, 1], +]).astype(np.float32) +G_T = np.transpose(G) + + +def output_shape(input_shape, filter_shape): + out_shape = np.zeros(4).astype(np.int32) + out_shape[0] = input_shape[0] + out_shape[1] = filter_shape[0] + out_shape[2] = input_shape[2] - 2 + out_shape[3] = input_shape[3] - 2 + return out_shape + + +def winog_conv(input, filter): + m = 2 + r = 3 + alpha = m + r - 1 + input_shape = input.shape + filter_shape = filter.shape + out_shape = output_shape(input_shape, filter_shape) + + K = filter_shape[0] + C = input_shape[1] + U = np.zeros((K * 16, C)) + + for k in range(K): + for c in range(C): + u = np.dot(np.dot(G, filter[k, c, :, :]), G_T) + for i in range(4): + for j in range(4) : + U[(i * 4 + j) * K + k, c] = u[i, j] + + print 'filter out: ', U.shape + print U[0, 0] + U.astype(np.float32).tofile("filter_out") + + rounded_h = int(math.ceil(out_shape[2] / 2.0)) + rounded_w = int(math.ceil(out_shape[3] / 2.0)) + P = input_shape[0] * rounded_h * rounded_w + V = np.zeros((C * 16, P)) + for p in range(P): + for c in range(C): + n = p / (rounded_w * rounded_h) + t = p % (rounded_h * rounded_w) + h_idx = t / rounded_w + w_idx = t % rounded_w + h_start = h_idx * 2 + w_start = w_idx * 2 + h_end = min(h_start+4, input_shape[2]) + w_end = min(w_start+4, input_shape[3]) + d = np.zeros((4, 4)) + d[0:h_end-h_start, 0:w_end-w_start] = input[n, c, h_start:h_end, w_start:w_end] + v = np.dot(np.dot(B_T, d), B) + for i in range(4): + for j in range(4): + V[(i*4+j)*C + c, p] = v[i, j] + + tmp = V.reshape(16, C, P, 1) + print 'input out: ', tmp.shape + tmp.astype(np.float32).tofile("C") + M = np.zeros((16 * K, P)) + for i in range(alpha * alpha): + u = U[i * K : (i+1) * K, :] + v = V[i * C : (i+1) * C, :] + M[i * K : (i+1) * K, :] = np.dot(u, v) + + print 'M shape: ', M.shape + M.astype(np.float32).tofile("gemm") + res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1])) + for k in range(K): + for b in range(P): + m = np.zeros((4, 4)) + for i in range(4): + for j in range(4): + m[i][j] = M[(i*4+j) * K + k, b] + y = np.dot(np.dot(A_T, m), A) + for i in range(2): + for j in range(2): + n = b / (rounded_h * rounded_w) + t = b % (rounded_h * rounded_w) + p = (t / rounded_w) * 2 + i + q = (t % rounded_w) * 2 + j + if p >= out_shape[2] or q >= out_shape[3]: + continue + res[n, p, q, k] = y[i, j] + + print 'Res shape: ', res.shape + res.astype(np.float32).tofile("res") + + return res + +def tf_conv(input, filter): + conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID') + with tf.Session() as sess: + res = sess.run(conv_op) + return res + + +def main(): + input = np.random.random([7, 61, 71, 31]).astype(np.float32) + # input = np.fromfile(file="A", dtype=np.float32) + # input = input.reshape(1, 3, 3, 5) + print 'input shape: ', input.shape + input.tofile("A") + filter = np.random.random([3, 3, 31, 31]).astype(np.float32) + tf_out = tf_conv(input, filter) + input = input.transpose((0, 3, 1, 2)) + filter = filter.transpose((3, 2, 0, 1)) + print 'filter shape: ', filter.shape + filter.tofile("filter_in") + winog_out = winog_conv(input, filter) + res = np.allclose(tf_out, winog_out) + if res: + print "=========Pass=========" + else: + print "=========Failed=========" + print "TF: ", tf_out + print "Winograd: ", winog_out + + +if __name__ == '__main__': + main() +
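The new TuningOrRun3DKernel / TuningOrRun2DKernel helpers in mace/kernels/opencl/helper.cc append a block count as the last element of each tuned parameter set: when the MACE_LIMIT_OPENCL_KERNEL_TIME environment variable is exactly "1" (see LimitKernelTime() in helper.h), a kernel whose measured time exceeds kMaxKernelExeTime (1000 microseconds) is re-enqueued as several smaller NDRange launches along the last global dimension. Below is a minimal standalone sketch of just that splitting arithmetic; SplitGlobalDim is a hypothetical name for illustration only, and the OpenCL enqueue itself is omitted.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the block-splitting logic in TuningOrRun3DKernel: pick how many
// serialized launches keep each one under the time budget, then compute the
// global size of every launch along the split dimension (hypothetical helper,
// not part of MACE).
std::vector<uint32_t> SplitGlobalDim(uint32_t gws_last,      // e.g. gws[2]
                                     double elapsed_micros,  // measured kernel time
                                     double max_micros = 1000.0 /* kMaxKernelExeTime */) {
  uint32_t num_blocks = std::min(
      static_cast<uint32_t>(elapsed_micros / max_micros) + 1, gws_last);
  const uint32_t block_size = gws_last / num_blocks;
  if (gws_last % num_blocks > 0) num_blocks++;  // one extra launch for the remainder
  std::vector<uint32_t> launches;
  for (uint32_t i = 0; i < num_blocks; ++i) {
    launches.push_back(i == num_blocks - 1 ? gws_last - i * block_size : block_size);
  }
  return launches;
}

int main() {
  // A 4.5 ms kernel over gws[2] = 113 becomes five launches of 22 work items
  // plus a trailing launch of 3; prints "22 22 22 22 22 3".
  for (uint32_t g : SplitGlobalDim(113, 4500.0)) std::cout << g << " ";
  std::cout << "\n";
}
```

Note that the cached tuning result stores the block count computed before the remainder check, while the enqueue loop adds one extra launch whenever the split is not exact, sizing the final launch to cover the leftover work items.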
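mace/utils/timer.h extends the Timer interface with AccumulateTiming(), ClearTiming() and AccumulatedMicros() so that a tuning callback which enqueues a kernel several times (as the split launches above do) can report the total execution time rather than only the last interval; Tuner::Run now reads timer->AccumulatedMicros() after each call. The following is a small self-contained illustration of the intended call pattern, using std::chrono in place of mace::utils::NowMicros and of the OpenCL event profiling behind OpenCLProfilingTimer; AccumulatingTimer is an illustrative stand-in, not a MACE class.

```cpp
#include <chrono>
#include <iostream>
#include <thread>

// A std::chrono-based stand-in for mace::WallClockTimer, showing how
// AccumulateTiming() folds several timed intervals into one running total.
class AccumulatingTimer {
  using Clock = std::chrono::steady_clock;

 public:
  void StartTiming() { start_ = Clock::now(); }
  void StopTiming() { stop_ = Clock::now(); }
  void AccumulateTiming() {
    StopTiming();
    accumulated_micros_ += std::chrono::duration_cast<std::chrono::microseconds>(
                               stop_ - start_).count();
  }
  void ClearTiming() {
    start_ = stop_ = Clock::time_point();
    accumulated_micros_ = 0;
  }
  double ElapsedMicros() const {
    return std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_).count();
  }
  double AccumulatedMicros() const { return accumulated_micros_; }

 private:
  Clock::time_point start_, stop_;
  double accumulated_micros_ = 0;
};

int main() {
  AccumulatingTimer timer;
  timer.ClearTiming();
  for (int block = 0; block < 3; ++block) {  // e.g. three split kernel launches
    timer.StartTiming();
    std::this_thread::sleep_for(std::chrono::milliseconds(2));  // stand-in for one launch
    timer.AccumulateTiming();
  }
  // At least roughly 6000 us: every 2 ms interval is added to the total.
  std::cout << "total: " << timer.AccumulatedMicros() << " us\n";
}
```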
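The change to mace/utils/tuner.h reworks the callback contract: TuneOrRun's functor now receives a Timer pointer (nullptr when a cached parameter set is replayed) plus an output vector, and Tune() caches whatever the functor writes into that vector rather than the candidate itself, which is how the adjusted block count reaches the parameter table. A toy sketch of that contract follows; Timer here is an illustrative stand-in and KernelFunc a hypothetical alias (the real functor returns cl_int and actually launches a kernel).

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct Timer {};  // stand-in for mace::Timer; only its (non-)nullness matters here

// The revised callback shape used by Tuner::TuneOrRun (illustrative alias).
using KernelFunc = std::function<int(const std::vector<uint32_t> &params,
                                     Timer *timer,
                                     std::vector<uint32_t> *tuning_result)>;

int main() {
  KernelFunc func = [](const std::vector<uint32_t> &params, Timer *timer,
                       std::vector<uint32_t> *tuning_result) -> int {
    if (timer == nullptr) {
      // Replay path: params[3] already carries the tuned block count.
      return 0;
    }
    // Tuning path: time the launch, then record the parameters that should
    // actually be cached, here with a pretend two-way split.
    tuning_result->assign(params.begin(), params.end());
    (*tuning_result)[3] = 2;
    return 0;
  };

  std::vector<uint32_t> candidate = {8, 16, 8, 1};
  std::vector<uint32_t> tuned;
  Timer timer;
  func(candidate, &timer, &tuned);
  // Tuner::Tune keeps `tuned` (not `candidate`) as the optimal parameters.
  std::cout << "cached block count: " << tuned[3] << "\n";  // prints 2
}
```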
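The new tools/wino_conv.py script checks a Winograd F(2x2, 3x3) convolution against tf.nn.conv2d. For reference, the per-output-tile transform it implements with the A, B and G matrices defined at the top of the script is

\[
Y_{k} = A^{T}\left[\,\sum_{c}\left(G\,g_{k,c}\,G^{T}\right)\odot\left(B^{T}\,d_{c}\,B\right)\right]A
\]

where d_c is a 4x4 input tile of channel c, g_{k,c} the corresponding 3x3 filter, the circle denotes an element-wise product, and Y_k the resulting 2x2 output tile. The script batches these products into 16 GEMMs: with its default shapes (input 7x61x71x31 NHWC, filter 3x3x31x31) the valid output is 7x59x69x31, there are P = 7 * ceil(59/2) * ceil(69/2) = 7 * 30 * 35 = 7350 tile positions, and U, V and M have shapes 496x31, 496x7350 and 496x7350 respectively (496 = 16 * 31).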