diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 473e5fb58b1dd958180f5c9786ccba0ece232d90..5575a0b1b70868e18a859131065ad4b498b27e43 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -64,84 +64,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                     static_cast<uint32_t>(width),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size =
-      runtime->GetKernelMaxWorkGroupSize(activation_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            activation_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            activation_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("relu_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future);
 }
 
 template struct ActivationFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 946e74cf8a6e2e5b7f6e0c0f2052c55ef2ce42e7..261efde071ee3b200c3a35290e685b43297ec956 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -49,89 +49,14 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
                     static_cast<uint32_t>(width_pixels),
                     static_cast<uint32_t>(batch_height_pixels)
   };
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
   std::vector<uint32_t> lws = {64, 16, 1};
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    uint32_t local_ws[2];
-    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
-    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1], 1},
-            {local_ws[1], local_ws[0], 1},
-            {kwg_size / 4, 4, 1},
-            {kwg_size / 16, 16, 1},
-            {kwg_size / 32, 32, 1},
-            {kwg_size / 64, 64, 1},
-            {kwg_size / 128, 128, 1},
-            {kwg_size / 256, 256, 1},
-            {kwg_size / 512, 512, 1},
-            {kwg_size, 1, 1},
-            {1, kwg_size, 1}
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            addn_kernel,
-            cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]),
-            nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          addn_kernel, cl::NullRange,
-          cl::NDRange(gws[0], gws[1]),
-          cl::NDRange(params[0], params[1]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            addn_kernel,
-            cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 29a5f2fa694256e3a087a2374d65da3f8e35da9c..02ab76a85eedcdeb735c69937a326522fcf6b273 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -84,83 +84,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                     static_cast<uint32_t>(width),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {8, 128, 1, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            bm_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            bm_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future);
 }
 
 template struct BatchNormFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 23082529f94abcbb560bdc78a18c19ff4e2e612a..b47a096efd2d2472e50b510e722e7142740fb332 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -50,96 +50,14 @@ static void Concat2(const Tensor *input0,
       static_cast<uint32_t>(width),
       static_cast<uint32_t>(batch * height),
   };
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            {local_ws[2], local_ws[1], local_ws[0], 1},
-            {kwg_size / 16, 4, 4, 1},
-            {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 32, 8, 4, 1},
-            {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 64, 16, 4, 1},
-            {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 16, 8, 1},
-            {kwg_size / 128, 32, 4, 1},
-            {1, kwg_size / 32, 32, 1},
-            {1, kwg_size / 64, 64, 1},
-            {1, kwg_size / 128, 128, 1},
-            {3, 15, 9, 1},
-            {7, 15, 9, 1},
-            {9, 7, 15, 1},
-            {15, 7, 9, 1},
-            {1, kwg_size, 1, 1},
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            concat_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            concat_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index e4b4ab93942e8b2f39d1b1ff9216ae60c61fe216..d62fdf56535372d7fa98da2dad16395656c078bb 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -97,83 +97,10 @@ void Conv1x1(const Tensor *input,
                     static_cast<uint32_t>(width_blocks),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 15, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input,
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index a374ea514d603c711b70a39f8a9879e6abe81088..3875403862fd97e58f6e6279e0d4e6a92ab9a96c 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -95,83 +95,10 @@ static void Conv2d3x3S12(const Tensor *input,
                     static_cast<uint32_t>(width_blocks),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {4, 15, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }
 
 void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index d671d4d884e69f65ac8a6390d7fce4008d8422f2..2a96d8647668ab1abd5bc61c07f1af1c4b37a99e 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -97,83 +97,10 @@ void Conv2dOpencl(const Tensor *input,
                     static_cast<uint32_t>(width_blocks),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a..783a30243407653cc660375b542f2c8f896ac52e 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -4,6 +4,7 @@
 
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -100,5 +101,181 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }
 
+
+void TuningOrRun3DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
+    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(gws[2],
+                                     kwg_size / (local_ws[0] * local_ws[1]));
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
+  };
+  cl::Event event;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
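+      // Split the launch along gws[2] into num_blocks pieces (the tuned
+      // value carried in params.back()) so that no single enqueue runs
+      // longer than the kMaxKernelExeTime budget measured below.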
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
+    return error;
+  };
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+
+  if (future != nullptr) {
+    future->wait_fn = [event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        OpenCLRuntime::Global()->GetCallStats(event, stats);
+      }
+    };
+  }
+}
+
+void TuningOrRun2DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
+    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1], 1},
+            {local_ws[1], local_ws[0], 1},
+            {kwg_size / 4, 4, 1},
+            {kwg_size / 16, 16, 1},
+            {kwg_size / 32, 32, 1},
+            {kwg_size / 64, 64, 1},
+            {kwg_size / 128, 128, 1},
+            {kwg_size / 256, 256, 1},
+            {kwg_size / 512, 512, 1},
+            {kwg_size, 1, 1},
+            {1, kwg_size, 1}
+    };
+  };
+  cl::Event event;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
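+        // The last block picks up the remainder when gws[1] is not an
+        // exact multiple of block_size.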
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]),
+            nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange,
+          cl::NDRange(gws[0], gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
+    return error;
+  };
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
+                                                     lws,
+                                                     params_generator,
+                                                     func,
+                                                     &timer);
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }
+
+}
+
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 34e787a8b97ee4246e7e9a61c87e0c474a2a06ec..cfbef59f7038285d462d635d172b892bd6de56a1 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -18,7 +18,7 @@ const float kMaxKernelExeTime = 1000.0;  // microseconds
 
 enum BufferType {
   FILTER = 0,
-  IN_OUT= 1,
+  IN_OUT = 1,
   ARGUMENT = 2
 };
 
@@ -34,6 +34,19 @@ std::string DtToCLDt(const DataType dt);
 
 std::string DtToUpstreamCLDt(const DataType dt);
 
+void TuningOrRun3DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future);
+
+
+void TuningOrRun2DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future);
+
 inline void SetFuture(StatsFuture *future, const cl::Event &event) {
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
@@ -48,7 +61,7 @@ inline void SetFuture(StatsFuture *future, const cl::Event &event) {
 namespace {
 template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
-    (*ss) << v;
+  (*ss) << v;
 }
 
 template <typename T, typename... Args>
@@ -56,8 +69,8 @@ void AppendToStream(std::stringstream *ss, const std::string &delimiter,
                     T first, Args... args) {
-    (*ss) << first << delimiter;
-    AppendToStream(ss, delimiter, args...);
+  (*ss) << first << delimiter;
+  AppendToStream(ss, delimiter, args...);
 }
 
 }  // namespace
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 194ee133042bf62c3fdcc70cff97a8c910b0aeaf..b147c15ad1e34def84560c4fd81da2988d1b8c89 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -64,95 +64,13 @@ static void Pooling(const Tensor *input,
   lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
   lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
   lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(out_height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            pooling_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            pooling_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 0ad87eeaf2ebcc43625411f39a76ba388be367bf..f8d3aed2a3cb232aafe54d9713dd8efd7635bddb 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -60,95 +60,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
                     static_cast<uint32_t>(out_width),
                     static_cast<uint32_t>(out_height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(out_height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            rb_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          rb_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            rb_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future);
 }
 
 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index ca9c5fdbe868508e963eb737f3eeb19744c770f7..e47a4f8956397424475dd14026b205a0b698485c 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -42,95 +42,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                     static_cast<uint32_t>(width),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            softmax_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          softmax_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            softmax_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index cf4762fc495cc0c3fa3af61bfba6ff40722cd0c7..8ef3f7c45e4c9bd61c0d02aa6e7d0e0dfdb75d82 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -62,95 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
       static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
-    local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            s2b_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            s2b_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2)
      << "_" << batch_tensor->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future);
 }
 
 template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
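
For reference, a minimal sketch of the call pattern every converted kernel above now follows; the kernel handle (my_kernel), the dimension variables, and the tuning-key prefix are illustrative placeholders, not code from this patch:

    // Hypothetical call site: build the global work size, pick a default
    // local work size plus a trailing kernel-split block count of 1, derive
    // a shape-specific tuning key, then let the helper tune (or just run).
    const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                             static_cast<uint32_t>(width),
                             static_cast<uint32_t>(height * batch)};
    std::vector<uint32_t> lws = {8, 16, 8, 1};
    std::string tuning_key =
        Concat("my_opencl_kernel_", output->dim(0), output->dim(1),
               output->dim(2), output->dim(3));
    TuningOrRun3DKernel(my_kernel, tuning_key, gws, lws, future);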