From a9dce8ec1e809ee48769f9044b6b6495ffd52e3b Mon Sep 17 00:00:00 2001
From: liuqi
Date: Wed, 24 Jan 2018 15:53:15 +0800
Subject: [PATCH] Add block tuning to limit each kernel's execution time to
 less than 1 ms.

---
 mace/core/runtime/opencl/opencl_runtime.cc    | 15 ++++
 mace/core/runtime/opencl/opencl_runtime.h     | 28 +++---
 mace/kernels/opencl/activation_opencl.cc      | 83 +++++++++++------
 mace/kernels/opencl/addn.cc                   | 73 ++++++++++-----
 mace/kernels/opencl/batch_norm_opencl.cc      | 83 +++++++++++------
 mace/kernels/opencl/concat.cc                 | 87 ++++++++++++------
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     | 83 +++++++++++------
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     | 84 ++++++++++++------
 mace/kernels/opencl/conv_2d_opencl_general.cc | 84 ++++++++++++------
 mace/kernels/opencl/helper.h                  |  2 +
 mace/kernels/opencl/pooling_opencl.cc         | 86 ++++++++++++------
 mace/kernels/opencl/resize_bilinear_opencl.cc | 82 ++++++++++++-----
 mace/kernels/opencl/softmax_opencl.cc         | 88 +++++++++++++------
 mace/kernels/opencl/space_to_batch_opencl.cc  | 82 ++++++++++++-----
 mace/utils/timer.h                            | 61 ++++++++-----
 mace/utils/tuner.h                            | 22 +++--
 16 files changed, 724 insertions(+), 319 deletions(-)

diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 3c8b013a..5b7ccdd8 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
   return (stop_nanos_ - start_nanos_) / 1000.0;
 }
 
+double OpenCLProfilingTimer::AccumulatedMicros() {
+  return accumulated_micros_;
+}
+
+void OpenCLProfilingTimer::AccumulateTiming() {
+  StopTiming();
+  accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
+}
+
+void OpenCLProfilingTimer::ClearTiming() {
+  start_nanos_ = 0;
+  stop_nanos_ = 0;
+  accumulated_micros_ = 0;
+}
+
 OpenCLRuntime *OpenCLRuntime::Global() {
   static OpenCLRuntime instance;
   return &instance;
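Note on the new Timer methods: AccumulateTiming() folds the most recent
(start, stop) pair into a running total, so a kernel that is split into N
enqueues reports the sum of the N per-block times rather than only the last
one. A minimal sketch of the intended call pattern (illustrative only;
EnqueueBlock is a hypothetical stand-in for the enqueue calls added below):

    // Sum per-block execution times across N enqueues of one kernel.
    timer->ClearTiming();                 // reset the running total
    for (uint32_t i = 0; i < num_blocks; ++i) {
      EnqueueBlock(i);                    // one cl::CommandQueue enqueue
      timer->AccumulateTiming();          // total += (stop - start) / 1000
    }
    double total_micros = timer->AccumulatedMicros();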
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 7245b926..ff596459 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,16 +18,20 @@ namespace mace {
 
 class OpenCLProfilingTimer : public Timer {
- public:
-  explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {};
-  void StartTiming() override;
-  void StopTiming() override;
-  double ElapsedMicros() override;
+ public:
+  explicit OpenCLProfilingTimer(const cl::Event *event)
+      : event_(event), accumulated_micros_(0) {};
+  void StartTiming() override;
+  void StopTiming() override;
+  void AccumulateTiming() override;
+  void ClearTiming() override;
+  double ElapsedMicros() override;
+  double AccumulatedMicros() override;
 
- private:
-  const cl::Event *event_;
-  double start_nanos_;
-  double stop_nanos_;
+ private:
+  const cl::Event *event_;
+  double start_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;
 };
 
 class OpenCLRuntime {
@@ -40,15 +44,15 @@ class OpenCLRuntime {
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint32_t GetDeviceMaxWorkGroupSize();
-  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
+  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
 
  private:
   OpenCLRuntime();
   ~OpenCLRuntime();
-  OpenCLRuntime(const OpenCLRuntime&) = delete;
-  OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
+  OpenCLRuntime(const OpenCLRuntime &) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
 
   void BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,

diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 44eaa47e..473e5fb5 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(activation_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            activation_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            activation_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
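The same split-and-enqueue rule recurs in every kernel below; distilled into
a standalone helper it looks like the sketch that follows (illustrative only;
SplitDim is not a function in this patch). Note that params.back() stores the
pre-increment divisor, so the replay path recomputes the same remainder block:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Returns {offset, size} of each block along one global-work dimension.
    std::vector<std::pair<uint32_t, uint32_t>> SplitDim(uint32_t gws_dim,
                                                        uint32_t num_blocks) {
      const uint32_t block_size = gws_dim / num_blocks;  // truncating divide
      if (gws_dim % num_blocks > 0) num_blocks++;        // extra remainder block
      std::vector<std::pair<uint32_t, uint32_t>> blocks;
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t size =
            (i == num_blocks - 1) ? (gws_dim - i * block_size) : block_size;
        blocks.emplace_back(i * block_size, size);
      }
      return blocks;  // e.g. gws_dim=10, num_blocks=3 -> (0,3)(3,3)(6,3)(9,1)
    }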
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index b4079dc3..946e74cf 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
       static_cast<uint32_t>(batch_height_pixels)
   };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
-  std::vector<uint32_t> lws = {64, 16};
+  std::vector<uint32_t> lws = {64, 16, 1};
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     uint32_t local_ws[2];
     local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
     local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1]},
-            {local_ws[1], local_ws[0]},
-            {kwg_size / 4, 4},
-            {kwg_size / 16, 16},
-            {kwg_size / 32, 32},
-            {kwg_size / 64, 64},
-            {kwg_size / 128, 128},
-            {kwg_size / 256, 256},
-            {kwg_size / 512, 512},
-            {kwg_size, 1},
-            {1, kwg_size}
-    };
+    return {{local_ws[0], local_ws[1], 1},
+            {local_ws[1], local_ws[0], 1},
+            {kwg_size / 4, 4, 1},
+            {kwg_size / 16, 16, 1},
+            {kwg_size / 32, 32, 1},
+            {kwg_size / 64, 64, 1},
+            {kwg_size / 128, 128, 1},
+            {kwg_size / 256, 256, 1},
+            {kwg_size / 512, 512, 1},
+            {kwg_size, 1, 1},
+            {1, kwg_size, 1}
+    };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        addn_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1]),
-        cl::NDRange(params[0], params[1]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            addn_kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]),
+            nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          addn_kernel, cl::NullRange,
+          cl::NDRange(gws[0], gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            addn_kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
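AddN enqueues a 2-D range, so the split runs along gws[1] (the batch*height
axis) with a 2-D offset; the logic is otherwise identical to the 3-D kernels
(illustrative excerpt):

    cl::NDRange offset(0, i * block_size);  // vs. NDRange(0, 0, i * block_size)
    cl::NDRange global(gws[0], gws1);       // only the last dimension is split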
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 2d6c95a3..29a5f2fa 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {8, 128, 1},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {8, 128, 1, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            bm_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            bm_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index a02b1552..23082529 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0,
       static_cast<uint32_t>(batch * height),
   };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {3, 15, 9},
-            {7, 15, 9},
-            {9, 7, 15},
-            {15, 7, 9},
-            {1, kwg_size, 1},
-            {4, 15, 8},  //SNPE size
+    return {{local_ws[0], local_ws[1], local_ws[2], 1},
+            {local_ws[2], local_ws[1], local_ws[0], 1},
+            {kwg_size / 16, 4, 4, 1},
+            {kwg_size / 32, 4, 8, 1},
+            {kwg_size / 32, 8, 4, 1},
+            {kwg_size / 64, 8, 8, 1},
+            {kwg_size / 64, 16, 4, 1},
+            {kwg_size / 128, 8, 16, 1},
+            {kwg_size / 128, 16, 8, 1},
+            {kwg_size / 128, 32, 4, 1},
+            {1, kwg_size / 32, 32, 1},
+            {1, kwg_size / 64, 64, 1},
+            {1, kwg_size / 128, 128, 1},
+            {3, 15, 9, 1},
+            {7, 15, 9, 1},
+            {9, 7, 15, 1},
+            {15, 7, 9, 1},
+            {1, kwg_size, 1, 1},
+            {4, 15, 8, 1},  //SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        concat_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            concat_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            concat_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index a8e9192d..e4b4ab93 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 15, 8};
+  std::vector<uint32_t> lws = {8, 15, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 9779107b..a374ea51 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {4, 15, 8};
+  std::vector<uint32_t> lws = {4, 15, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {local_ws[2], local_ws[1], local_ws[0]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 89295799..d671d4d8 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {local_ws[2], local_ws[1], local_ws[0]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 2927dbff..34e787a8 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -14,6 +14,8 @@ namespace mace {
 namespace kernels {
 
+const float kMaxKernelExeTime = 1000.0;  // microseconds
+
 enum BufferType {
   FILTER = 0,
   IN_OUT= 1,
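The 1000 µs budget converts one measured single-shot time into a block count
by truncating integer division, so a kernel already under 1 ms keeps a single
block and stays unsplit. A worked example (values illustrative; gws2_dim is a
hypothetical stand-in for gws[2]):

    // Measured 3400 us: 3400 / 1000 truncates to 3, +1 => 4 blocks,
    // each covering roughly gws[2] / 4 of the last global dimension.
    double elapse_time = 3400.0;  // microseconds
    uint32_t num_blocks = std::min<uint32_t>(
        static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws2_dim);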
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 79a6f102..194ee133 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input,
       static_cast<uint32_t>(batch * out_height),
   };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  std::vector<uint32_t> lws(3, 0);
+  std::vector<uint32_t> lws(4, 1);
   lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
   lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
   lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(out_height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {3, 15, 9},
-            {7, 15, 9},
-            {9, 7, 15},
-            {15, 7, 9},
-            {1, kwg_size, 1},
-            {4, 15, 8},  //SNPE size
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        pooling_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            pooling_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            pooling_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index dc0d8cd0..0ad87eea 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(out_height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {1, kwg_size, 1},
-            {4, 15, 8},  //SNPE size
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        rb_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            rb_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          rb_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            rb_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index bfc75e73..ca9c5fdb 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8},  //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {3, 15, 9},
-            {7, 15, 9},
-            {9, 7, 15},
-            {15, 7, 9},
-            {1, kwg_size, 1}};
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        softmax_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            softmax_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          softmax_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            softmax_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 1fd5bf1a..cf4762fc 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor,
   const uint32_t gws[3] = {chan_blk,
                            static_cast<uint32_t>(batch_tensor->dim(2)),
                            static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
     local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {4, 32, 8},
-            {4, 64, 4},
-            {4, 128, 2},
-            {8, 16, 8},
-            {8, 32, 4},
-            {8, 64, 2},
-            {16, 8, 8},
-            {16, 16, 4},
-            {16, 32, 2},
-            {32, 8, 4},
-            {32, 16, 2},
-            {64, 4, 4}};
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        s2b_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            s2b_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            s2b_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/utils/timer.h b/mace/utils/timer.h
index cee4411e..ca0c2b3c 100644
--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -10,29 +10,50 @@ namespace mace {
 
 class Timer {
- public:
-  virtual void StartTiming() = 0;
-  virtual void StopTiming() = 0;
-  virtual double ElapsedMicros() = 0;
+ public:
+  virtual void StartTiming() = 0;
+  virtual void StopTiming() = 0;
+  virtual void AccumulateTiming() = 0;
+  virtual void ClearTiming() = 0;
+  virtual double ElapsedMicros() = 0;
+  virtual double AccumulatedMicros() = 0;
 };
 
 class WallClockTimer : public Timer {
- public:
-  void StartTiming() override {
-    start_micros_ = mace::utils::NowMicros();
-  }
-
-  void StopTiming() override {
-    stop_micros_ = mace::utils::NowMicros();
-  }
-
-  double ElapsedMicros() override {
-    return stop_micros_ - start_micros_;
-  }
-
- private:
-  double start_micros_;
-  double stop_micros_;
+ public:
+  WallClockTimer() : accumulated_micros_(0) {}
+
+  void StartTiming() override {
+    start_micros_ = mace::utils::NowMicros();
+  }
+
+  void StopTiming() override {
+    stop_micros_ = mace::utils::NowMicros();
+  }
+
+  void AccumulateTiming() override {
+    StopTiming();
+    accumulated_micros_ += stop_micros_ - start_micros_;
+  }
+
+  void ClearTiming() override {
+    start_micros_ = 0;
+    stop_micros_ = 0;
+    accumulated_micros_ = 0;
+  }
+
+  double ElapsedMicros() override {
+    return stop_micros_ - start_micros_;
+  }
+
+  double AccumulatedMicros() override {
+    return accumulated_micros_;
+  }
+
+ private:
+  double start_micros_;
+  double stop_micros_;
+  double accumulated_micros_;
 };
 
 }  // namespace mace
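One subtlety worth noting: WallClockTimer::AccumulateTiming() measures from
the last StartTiming() call, so a caller must restart the clock before each
accumulation; the OpenCL profiling timer presumably derives both timestamps
from the cl::Event instead, which is why the kernel loops above can call
AccumulateTiming() alone per block. Wall-clock usage sketch (illustrative;
DoWork is hypothetical):

    WallClockTimer timer;
    timer.ClearTiming();
    for (int i = 0; i < n; ++i) {
      timer.StartTiming();       // required before each accumulation
      DoWork(i);
      timer.AccumulateTiming();  // adds stop_micros_ - start_micros_
    }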
+ + private: + double start_micros_; + double stop_micros_; + double accumulated_micros_; }; } // namespace mace diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index b7364e66..e2797fa9 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -41,10 +41,10 @@ class Tuner { template RetType TuneOrRun( const std::string param_key, - const std::vector &default_param, + std::vector &default_param, const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *)> &func, Timer *timer) { std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key); if (IsTuning() && param_generator != nullptr) { @@ -60,12 +60,12 @@ class Tuner { if (param_table_.find(obfucated_param_key) != param_table_.end()) { VLOG(1) << param_key << ": " << internal::MakeString(param_table_[obfucated_param_key]); - return func(param_table_[obfucated_param_key]); + return func(param_table_[obfucated_param_key], nullptr); } else { #ifndef MACE_DISABLE_NO_TUNING_WARNING LOG(WARNING) << "Fallback to default parameter: " << param_key; #endif - return func(default_param); + return func(default_param, nullptr); } } } @@ -119,18 +119,16 @@ class Tuner { template inline RetType Run( - const std::function &)> &func, - const std::vector ¶ms, + const std::function &, Timer *)> &func, + std::vector ¶ms, Timer *timer, int num_runs, double *time_us) { RetType res; int64_t total_time_us = 0; for (int i = 0; i < num_runs; ++i) { - timer->StartTiming(); - res = func(params); - timer->StopTiming(); - total_time_us += timer->ElapsedMicros(); + res = func(params, timer); + total_time_us += timer->AccumulatedMicros(); } *time_us = total_time_us * 1.0 / num_runs; @@ -141,13 +139,13 @@ class Tuner { inline RetType Tune( const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *)> &func, Timer *timer, std::vector *opt_params) { RetType res; double opt_time = std::numeric_limits::max(); auto params = param_generator(); - for (const auto ¶m : params) { + for (auto param : params) { double tmp_time = 0.0; // warm up Run(func, param, timer, 2, &tmp_time); -- GitLab