Add block tuning to keep the execution time of each kernel launch under 1 ms.

a9dce8ec · liuqi · 537b4600 · a9dce8ec · a9dce8ec · a9dce8ec
16 changed files
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
  return (stop_nanos_ - start_nanos_) / 1000.0;
 }
+double OpenCLProfilingTimer::AccumulatedMicros() {  // Returns the running total, in microseconds, built up by AccumulateTiming().
+  return accumulated_micros_;  // zeroed by the constructor and by ClearTiming()
+}
+void OpenCLProfilingTimer::AccumulateTiming(){  // Ends the current measurement and adds its duration to the running total.
+  StopTiming();  // NOTE(review): body not shown here; this is called after ClearTiming() without a fresh StartTiming(), so it presumably refreshes start_nanos_ as well as stop_nanos_ - confirm
+  accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;  // nanoseconds -> microseconds
+}
+void OpenCLProfilingTimer::ClearTiming() {  // Resets every recorded timing value to zero.
+  start_nanos_ = 0;
+  stop_nanos_ = 0;
+  accumulated_micros_ = 0;  // discards the total gathered by AccumulateTiming()
+}
 OpenCLRuntime *OpenCLRuntime::Global() {
  static OpenCLRuntime instance;
  return &instance;

--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,16 +18,20 @@
 namespace mace {
 class OpenCLProfilingTimer : public Timer {
-  public:
+ public:
-    explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {};
+  explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {};  // stores the event pointer (not a copy); the running total starts at 0
-    void StartTiming() override;
+  void StartTiming() override;
-    void StopTiming() override;
+  void StopTiming() override;
-    double ElapsedMicros() override;
+  void AccumulateTiming() override;  // calls StopTiming(), then adds (stop - start) / 1000.0 to the running total
+  void ClearTiming() override;  // zeroes the start, stop and accumulated values
+  double ElapsedMicros() override;  // (stop_nanos_ - start_nanos_) / 1000.0
+  double AccumulatedMicros() override;  // running total built by AccumulateTiming()
-  private:
+ private:
-    const cl::Event *event_;
+  const cl::Event *event_;
-    double start_nanos_;
+  double start_nanos_;
-    double stop_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;  // sum of measured intervals in microseconds; reset by ClearTiming()
 };
 class OpenCLRuntime {
@@ -40,15 +44,15 @@ class OpenCLRuntime {
  void GetCallStats(const cl::Event &event, CallStats *stats);
  uint32_t GetDeviceMaxWorkGroupSize();
-  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
+  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
  cl::Kernel BuildKernel(const std::string &program_name,
                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);
 private:
  OpenCLRuntime();
  ~OpenCLRuntime();
-  OpenCLRuntime(const OpenCLRuntime&) = delete;
+  OpenCLRuntime(const OpenCLRuntime &) = delete;
-  OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
  void BuildProgram(const std::string &program_file_name,
                    const std::string &binary_file_name,

--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  const uint32_t kwg_size =
      runtime->GetKernelMaxWorkGroupSize(activation_kernel);
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
    return {
-        {local_ws[0], local_ws[1], local_ws[2]},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4},
+        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8},
+        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4},
+        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8},
+        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4},
+        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16},
+        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4},
+        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32},
+        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64},
+        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128},
+        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9},
+        {3, 15, 9, 1},
-        {7, 15, 9},
+        {7, 15, 9, 1},
-        {9, 7, 15},
+        {9, 7, 15, 1},
-        {15, 7, 9},
+        {15, 7, 9, 1},
-        {1, kwg_size, 1},
+        {1, kwg_size, 1, 1},
-        {4, 15, 8},  // SNPE size
+        {4, 15, 8, 1},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+    if (timer == nullptr) {
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            activation_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            activation_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::string tuning_key =

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
      static_cast<uint32_t>(batch_height_pixels)
  };
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
-  std::vector<uint32_t> lws = {64, 16};
+  std::vector<uint32_t> lws = {64, 16, 1};
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    uint32_t local_ws[2];
    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1]},
+    return {{local_ws[0], local_ws[1], 1},
-            {local_ws[1], local_ws[0]},
+            {local_ws[1], local_ws[0], 1},
-            {kwg_size / 4, 4},
+            {kwg_size / 4, 4, 1},
-            {kwg_size / 16, 16},
+            {kwg_size / 16, 16, 1},
-            {kwg_size / 32, 32},
+            {kwg_size / 32, 32, 1},
-            {kwg_size / 64, 64},
+            {kwg_size / 64, 64, 1},
-            {kwg_size / 128, 128},
+            {kwg_size / 128, 128, 1},
-            {kwg_size / 256, 256},
+            {kwg_size / 256, 256, 1},
-            {kwg_size / 512, 512},
+            {kwg_size / 512, 512, 1},
-            {kwg_size, 1},
+            {kwg_size, 1, 1},
-            {1, kwg_size}
+            {1, kwg_size, 1}
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        addn_kernel, cl::NullRange,
+    if (timer == nullptr) {
-        cl::NDRange(gws[0], gws[1]),
+      uint32_t num_blocks = params.back();
-        cl::NDRange(params[0], params[1]),
+      const uint32_t block_size = gws[1] / num_blocks;
-        nullptr, &event);
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            addn_kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]),
+            nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          addn_kernel, cl::NullRange,
+          cl::NDRange(gws[0], gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            addn_kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::stringstream ss;

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    std::vector<uint32_t> local_ws(3, 0);
@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
    return {
-        {local_ws[0], local_ws[1], local_ws[2]},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4},
+        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8},
+        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4},
+        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8},
+        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4},
+        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16},
+        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4},
+        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32},
+        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64},
+        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128},
+        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9},
+        {3, 15, 9, 1},
-        {7, 15, 9},
+        {7, 15, 9, 1},
-        {9, 7, 15},
+        {9, 7, 15, 1},
-        {15, 7, 9},
+        {15, 7, 9, 1},
-        {1, kwg_size, 1},
+        {1, kwg_size, 1, 1},
-        {8, 128, 1},  // SNPE size
+        {8, 128, 1, 1},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+    if (timer == nullptr) {
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            bm_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            bm_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::string tuning_key =

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0,
      static_cast<uint32_t>(batch * height),
  };
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    std::vector<uint32_t> local_ws(3, 0);
    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            {local_ws[2], local_ws[1], local_ws[0]},
+            {local_ws[2], local_ws[1], local_ws[0], 1},
-            {kwg_size / 16, 4, 4},
+            {kwg_size / 16, 4, 4, 1},
-            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 32, 8, 4},
+            {kwg_size / 32, 8, 4, 1},
-            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 64, 16, 4},
+            {kwg_size / 64, 16, 4, 1},
-            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 16, 8, 1},
-            {kwg_size / 128, 32, 4},
+            {kwg_size / 128, 32, 4, 1},
-            {1, kwg_size / 32, 32},
+            {1, kwg_size / 32, 32, 1},
-            {1, kwg_size / 64, 64},
+            {1, kwg_size / 64, 64, 1},
-            {1, kwg_size / 128, 128},
+            {1, kwg_size / 128, 128, 1},
-            {3, 15, 9},
+            {3, 15, 9, 1},
-            {7, 15, 9},
+            {7, 15, 9, 1},
-            {9, 7, 15},
+            {9, 7, 15, 1},
-            {15, 7, 9},
+            {15, 7, 9, 1},
-            {1, kwg_size, 1},
+            {1, kwg_size, 1, 1},
-            {4, 15, 8}, //SNPE size
+            {4, 15, 8, 1}, //SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        concat_kernel, cl::NullRange,
+    if (timer == nullptr) {
-        cl::NDRange(gws[0], gws[1], gws[2]),
+      uint32_t num_blocks = params.back();
-        cl::NDRange(params[0], params[1], params[2]),
+      const uint32_t block_size = gws[2] / num_blocks;
-        nullptr, &event);
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            concat_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            concat_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::stringstream ss;

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 15, 8};
+  std::vector<uint32_t> lws = {8, 15, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    std::vector<uint32_t> local_ws(3, 0);
@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input,
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
    return {
-        {local_ws[0], local_ws[1], local_ws[2]},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4},
+        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8},
+        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4},
+        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8},
+        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4},
+        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16},
+        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4},
+        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32},
+        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64},
+        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128},
+        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9},
+        {3, 15, 9, 1},
-        {7, 15, 9},
+        {7, 15, 9, 1},
-        {9, 7, 15},
+        {9, 7, 15, 1},
-        {15, 7, 9},
+        {15, 7, 9, 1},
-        {1, kwg_size, 1},
+        {1, kwg_size, 1, 1},
-        {4, 15, 8},  // SNPE size
+        {4, 15, 8, 1},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+    if (timer == nullptr) {
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::string tuning_key =

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {4, 15, 8};
+  std::vector<uint32_t> lws = {4, 15, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    std::vector<uint32_t> local_ws(3, 0);
@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input,
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
    return {
-        {local_ws[0], local_ws[1], local_ws[2]},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {local_ws[2], local_ws[1], local_ws[0]},
+        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 16, 4, 4},
+        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 4, 8},
+        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 32, 8, 4},
+        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 8, 8},
+        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 64, 16, 4},
+        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 8, 16},
+        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 32, 4, 1},
-        {kwg_size / 128, 32, 4},
+        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 32, 32},
+        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 64, 64},
+        {1, kwg_size / 128, 128, 1},
-        {1, kwg_size / 128, 128},
+        {3, 15, 9, 1},
-        {3, 15, 9},
+        {7, 15, 9, 1},
-        {7, 15, 9},
+        {9, 7, 15, 1},
-        {9, 7, 15},
+        {15, 7, 9, 1},
-        {15, 7, 9},
+        {1, kwg_size, 1, 1},
-        {1, kwg_size, 1},
+        {4, 15, 8, 1},  // SNPE size
-        {4, 15, 8},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+    if (timer == nullptr) {
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::string tuning_key =

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input,
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    std::vector<uint32_t> local_ws(3, 0);
@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input,
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
    return {
-        {local_ws[0], local_ws[1], local_ws[2]},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {local_ws[2], local_ws[1], local_ws[0]},
+        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 16, 4, 4},
+        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 4, 8},
+        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 32, 8, 4},
+        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 8, 8},
+        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 64, 16, 4},
+        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 8, 16},
+        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 32, 4, 1},
-        {kwg_size / 128, 32, 4},
+        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 32, 32},
+        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 64, 64},
+        {1, kwg_size / 128, 128, 1},
-        {1, kwg_size / 128, 128},
+        {3, 15, 9, 1},
-        {3, 15, 9},
+        {7, 15, 9, 1},
-        {7, 15, 9},
+        {9, 7, 15, 1},
-        {9, 7, 15},
+        {15, 7, 9, 1},
-        {15, 7, 9},
+        {1, kwg_size, 1, 1},
-        {1, kwg_size, 1},
+        {4, 15, 8, 1},  // SNPE size
-        {4, 15, 8},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+    if (timer == nullptr) {
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::string tuning_key =

--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -14,6 +14,8 @@
 namespace mace {
 namespace kernels {
+// Time budget in microseconds (1000 us = 1 ms). The tuning lambdas divide a
+// measured kernel time by this value to pick how many blocks a launch is
+// split into.
+const float kMaxKernelExeTime = 1000.0; // microseconds
 enum BufferType {
  FILTER = 0,
  IN_OUT= 1,

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input,
      static_cast<uint32_t>(batch * out_height),
  };
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  std::vector<uint32_t> lws(3, 0);
+  std::vector<uint32_t> lws(4, 1);
  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input,
    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
+    return {
-            {kwg_size / 16, 4, 4},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-            {kwg_size / 32, 4, 8},
+        {kwg_size / 16, 4, 4, 1},
-            {kwg_size / 32, 8, 4},
+        {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 64, 8, 8},
+        {kwg_size / 32, 8, 4, 1},
-            {kwg_size / 64, 16, 4},
+        {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 128, 8, 16},
+        {kwg_size / 64, 16, 4, 1},
-            {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 32, 4},
+        {kwg_size / 128, 16, 8, 1},
-            {1, kwg_size / 32, 32},
+        {kwg_size / 128, 32, 4, 1},
-            {1, kwg_size / 64, 64},
+        {1, kwg_size / 32, 32, 1},
-            {1, kwg_size / 128, 128},
+        {1, kwg_size / 64, 64, 1},
-            {3, 15, 9},
+        {1, kwg_size / 128, 128, 1},
-            {7, 15, 9},
+        {3, 15, 9, 1},
-            {9, 7, 15},
+        {7, 15, 9, 1},
-            {15, 7, 9},
+        {9, 7, 15, 1},
-            {1, kwg_size, 1},
+        {15, 7, 9, 1},
-            {4, 15, 8}, //SNPE size
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  // Enqueues the pooling kernel as a series of blocks along gws[2].
+  //   params[0..2]  local work size passed to every enqueue
+  //   params.back() number of blocks the gws[2] range is split into
+  // timer == nullptr: replay the block count stored in params.
+  // timer != nullptr: run the whole range once, derive the block count from
+  //   that run's elapsed time and kMaxKernelExeTime, store it in
+  //   params.back(), then run the blocks and add each one's time to the
+  //   timer's accumulated total.
+  // Returns the status of the last enqueue. `event` is overwritten by every
+  // enqueue, so afterwards it refers to the last block only.
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        pooling_kernel, cl::NullRange,
+    if (timer == nullptr) {
-        cl::NDRange(gws[0], gws[1], gws[2]),
+      // A block count of 0 would divide by zero below; treat it as 1.
+      uint32_t num_blocks = std::max<uint32_t>(params.back(), 1);
-        cl::NDRange(params[0], params[1], params[2]),
+      const uint32_t block_size = gws[2] / num_blocks;
-        nullptr, &event);
+      // One extra block picks up the remainder of an uneven split.
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        // The last block takes whatever is left of gws[2].
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            pooling_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      // Probe run: time the full range in a single launch.
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      // At most gws[2] blocks, and never 0 (gws[2] == 0 would otherwise
+      // make the division below divide by zero).
+      uint32_t num_blocks = std::max<uint32_t>(
+          std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]), 1);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        // Re-arm the timer for this block: ClearTiming() reset the start
+        // point, so without this WallClockTimer::AccumulateTiming() would add
+        // the absolute clock value instead of the block's duration.
+        timer->StartTiming();
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            pooling_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::stringstream ss;

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(out_width),
                           static_cast<uint32_t>(out_height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+  // Builds the list of candidate parameter sets. Each candidate is three
+  // local work sizes followed by a block count; every candidate listed here
+  // has a block count of 1.
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    // First candidate: size dimension 0, then 1, then 2, each capped by what
+    // is left of kwg_size after the previous dimensions.
    std::vector<uint32_t> local_ws(3, 0);
    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
+    // NOTE(review): kwg_size / 128 is 0 when kwg_size < 128, and the fixed
+    // candidates such as {3, 15, 9, 1} do not depend on kwg_size -- confirm
+    // that unusable candidates are rejected before they reach the kernel.
+    return {
-            {kwg_size / 16, 4, 4},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-            {kwg_size / 32, 4, 8},
+        {kwg_size / 16, 4, 4, 1},
-            {kwg_size / 32, 8, 4},
+        {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 64, 8, 8},
+        {kwg_size / 32, 8, 4, 1},
-            {kwg_size / 64, 16, 4},
+        {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 128, 8, 16},
+        {kwg_size / 64, 16, 4, 1},
-            {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 32, 4},
+        {kwg_size / 128, 16, 8, 1},
-            {1, kwg_size / 32, 32},
+        {kwg_size / 128, 32, 4, 1},
-            {1, kwg_size / 64, 64},
+        {1, kwg_size / 32, 32, 1},
-            {1, kwg_size / 128, 128},
+        {1, kwg_size / 64, 64, 1},
-            {1, kwg_size, 1},
+        {1, kwg_size / 128, 128, 1},
-            {4, 15, 8}, //SNPE size
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  // Enqueues the resize-bilinear kernel as a series of blocks along gws[2].
+  //   params[0..2]  local work size passed to every enqueue
+  //   params.back() number of blocks the gws[2] range is split into
+  // timer == nullptr: replay the block count stored in params.
+  // timer != nullptr: run the whole range once, derive the block count from
+  //   that run's elapsed time and kMaxKernelExeTime, store it in
+  //   params.back(), then run the blocks and add each one's time to the
+  //   timer's accumulated total.
+  // Returns the status of the last enqueue. `event` is overwritten by every
+  // enqueue, so afterwards it refers to the last block only.
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        rb_kernel, cl::NullRange,
+    if (timer == nullptr) {
-        cl::NDRange(gws[0], gws[1], gws[2]),
+      // A block count of 0 would divide by zero below; treat it as 1.
+      uint32_t num_blocks = std::max<uint32_t>(params.back(), 1);
-        cl::NDRange(params[0], params[1], params[2]),
+      const uint32_t block_size = gws[2] / num_blocks;
-        nullptr, &event);
+      // One extra block picks up the remainder of an uneven split.
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        // The last block takes whatever is left of gws[2].
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            rb_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      // Probe run: time the full range in a single launch.
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          rb_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      // At most gws[2] blocks, and never 0 (gws[2] == 0 would otherwise
+      // make the division below divide by zero).
+      uint32_t num_blocks = std::max<uint32_t>(
+          std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]), 1);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        // Re-arm the timer for this block: ClearTiming() reset the start
+        // point, so without this WallClockTimer::AccumulateTiming() would add
+        // the absolute clock value instead of the block's duration.
+        timer->StartTiming();
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            rb_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::stringstream ss;

--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width),
                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
+  // Builds the list of candidate parameter sets. Each candidate is three
+  // local work sizes followed by a block count; every candidate listed here
+  // has a block count of 1.
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    // First candidate: size dimension 0, then 1, then 2, each capped by what
+    // is left of kwg_size after the previous dimensions.
    std::vector<uint32_t> local_ws(3, 0);
    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
+    // NOTE(review): kwg_size / 128 is 0 when kwg_size < 128, and the fixed
+    // candidates such as {3, 15, 9, 1} do not depend on kwg_size -- confirm
+    // that unusable candidates are rejected before they reach the kernel.
+    return {
-            {local_ws[0], local_ws[1], local_ws[2]},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-            {local_ws[2], local_ws[1], local_ws[0]},
+        {kwg_size / 16, 4, 4, 1},
-            {kwg_size / 16, 4, 4},
+        {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 32, 4, 8},
+        {kwg_size / 32, 8, 4, 1},
-            {kwg_size / 32, 8, 4},
+        {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 64, 8, 8},
+        {kwg_size / 64, 16, 4, 1},
-            {kwg_size / 64, 16, 4},
+        {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 8, 16},
+        {kwg_size / 128, 16, 8, 1},
-            {kwg_size / 128, 16, 8},
+        {kwg_size / 128, 32, 4, 1},
-            {kwg_size / 128, 32, 4},
+        {1, kwg_size / 32, 32, 1},
-            {1, kwg_size / 32, 32},
+        {1, kwg_size / 64, 64, 1},
-            {1, kwg_size / 64, 64},
+        {1, kwg_size / 128, 128, 1},
-            {1, kwg_size / 128, 128},
+        {3, 15, 9, 1},
-            {3, 15, 9},
+        {7, 15, 9, 1},
-            {7, 15, 9},
+        {9, 7, 15, 1},
-            {9, 7, 15},
+        {15, 7, 9, 1},
-            {15, 7, 9},
+        {1, kwg_size, 1, 1},
-            {1, kwg_size, 1}};
+        {4, 15, 8, 1},  // SNPE size
+    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  // Enqueues the softmax kernel as a series of blocks along gws[2].
+  //   params[0..2]  local work size passed to every enqueue
+  //   params.back() number of blocks the gws[2] range is split into
+  // timer == nullptr: replay the block count stored in params.
+  // timer != nullptr: run the whole range once, derive the block count from
+  //   that run's elapsed time and kMaxKernelExeTime, store it in
+  //   params.back(), then run the blocks and add each one's time to the
+  //   timer's accumulated total.
+  // Returns the status of the last enqueue. `event` is overwritten by every
+  // enqueue, so afterwards it refers to the last block only.
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        softmax_kernel, cl::NullRange,
+    if (timer == nullptr) {
-        cl::NDRange(gws[0], gws[1], gws[2]),
+      // A block count of 0 would divide by zero below; treat it as 1.
+      uint32_t num_blocks = std::max<uint32_t>(params.back(), 1);
-        cl::NDRange(params[0], params[1], params[2]),
+      const uint32_t block_size = gws[2] / num_blocks;
-        nullptr, &event);
+      // One extra block picks up the remainder of an uneven split.
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        // The last block takes whatever is left of gws[2].
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            softmax_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      // Probe run: time the full range in a single launch.
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          softmax_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      // At most gws[2] blocks, and never 0 (gws[2] == 0 would otherwise
+      // make the division below divide by zero).
+      uint32_t num_blocks = std::max<uint32_t>(
+          std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]), 1);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        // Re-arm the timer for this block: ClearTiming() reset the start
+        // point, so without this WallClockTimer::AccumulateTiming() would add
+        // the absolute clock value instead of the block's duration.
+        timer->StartTiming();
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            softmax_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::stringstream ss;

--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
  const uint32_t gws[3] = {chan_blk,
                           static_cast<uint32_t>(batch_tensor->dim(2)),
                           static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
+  // Builds the list of candidate parameter sets. Each candidate is three
+  // local work sizes followed by a block count; every candidate listed here
+  // has a block count of 1.
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    // First candidate: dimension 0 follows chan_blk, dimensions 1 and 2 are
+    // capped at 32 and by what is left of kwg_size.
    std::vector<uint32_t> local_ws(3, 0);
    local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
    local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
+    // NOTE(review): kwg_size / 128 is 0 when kwg_size < 128, and the fixed
+    // candidates such as {3, 15, 9, 1} do not depend on kwg_size -- confirm
+    // that unusable candidates are rejected before they reach the kernel.
+    return {
-            {4, 32, 8},
+        {local_ws[0], local_ws[1], local_ws[2], 1},
-            {4, 64, 4},
+        {kwg_size / 16, 4, 4, 1},
-            {4, 128, 2},
+        {kwg_size / 32, 4, 8, 1},
-            {8, 16, 8},
+        {kwg_size / 32, 8, 4, 1},
-            {8, 32, 4},
+        {kwg_size / 64, 8, 8, 1},
-            {8, 64, 2},
+        {kwg_size / 64, 16, 4, 1},
-            {16, 8, 8},
+        {kwg_size / 128, 8, 16, 1},
-            {16, 16, 4},
+        {kwg_size / 128, 16, 8, 1},
-            {16, 32, 2},
+        {kwg_size / 128, 32, 4, 1},
-            {32, 8, 4},
+        {1, kwg_size / 32, 32, 1},
-            {32, 16, 2},
+        {1, kwg_size / 64, 64, 1},
-            {64, 4, 4}};
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
  };
  cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+  // Enqueues the space-to-batch kernel as a series of blocks along gws[2].
+  //   params[0..2]  local work size passed to every enqueue
+  //   params.back() number of blocks the gws[2] range is split into
+  // timer == nullptr: replay the block count stored in params.
+  // timer != nullptr: run the whole range once, derive the block count from
+  //   that run's elapsed time and kMaxKernelExeTime, store it in
+  //   params.back(), then run the blocks and add each one's time to the
+  //   timer's accumulated total.
+  // Returns the status of the last enqueue. `event` is overwritten by every
+  // enqueue, so afterwards it refers to the last block only.
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+    cl_int error = CL_SUCCESS;
-        s2b_kernel, cl::NullRange,
+    if (timer == nullptr) {
-        cl::NDRange(gws[0], gws[1], gws[2]),
+      // A block count of 0 would divide by zero below; treat it as 1.
+      uint32_t num_blocks = std::max<uint32_t>(params.back(), 1);
-        cl::NDRange(params[0], params[1], params[2]),
+      const uint32_t block_size = gws[2] / num_blocks;
-        nullptr, &event);
+      // One extra block picks up the remainder of an uneven split.
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        // The last block takes whatever is left of gws[2].
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            s2b_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      // Probe run: time the full range in a single launch.
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      // At most gws[2] blocks, and never 0 (gws[2] == 0 would otherwise
+      // make the division below divide by zero).
+      uint32_t num_blocks = std::max<uint32_t>(
+          std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]), 1);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        // Re-arm the timer for this block: ClearTiming() reset the start
+        // point, so without this WallClockTimer::AccumulateTiming() would add
+        // the absolute clock value instead of the block's duration.
+        timer->StartTiming();
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            s2b_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
    return error;
  };
  std::stringstream ss;

--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -10,29 +10,50 @@
 namespace mace {
 class Timer {
-  public:
+ public:
-    virtual void StartTiming() = 0;
+  virtual void StartTiming() = 0;
-    virtual void StopTiming() = 0;
+  virtual void StopTiming() = 0;
-    virtual double ElapsedMicros() = 0;
+  virtual void AccumulateTiming() = 0;
+  virtual void ClearTiming() = 0;
+  virtual double ElapsedMicros() = 0;
+  virtual double AccumulatedMicros() = 0;
 };
 class WallClockTimer : public Timer {
-  public:
+ public:
-    void StartTiming() override {
+  WallClockTimer() : accumulated_micros_(0) {}
-      start_micros_ = mace::utils::NowMicros();
-    }
+  void StartTiming() override {
+    start_micros_ = mace::utils::NowMicros();
-    void StopTiming() override {
+  }
-      stop_micros_ = mace::utils::NowMicros();
-    }
+  void StopTiming() override {
+    stop_micros_ = mace::utils::NowMicros();
-    double ElapsedMicros() override {
+  }
-      return stop_micros_ - start_micros_;
-    }
+  void AccumulateTiming() override {
+    StopTiming();
-  private:
+    accumulated_micros_ += stop_micros_ - start_micros_;
-    double start_micros_;
+  }
-    double stop_micros_;
+  void ClearTiming() override {
+    start_micros_ = 0;
+    stop_micros_ = 0;
+    accumulated_micros_ = 0;
+  }
+  double ElapsedMicros() override {
+    return stop_micros_ - start_micros_;
+  }
+  double AccumulatedMicros() override {
+    return accumulated_micros_;
+  }
+ private:
+  double start_micros_;
+  double stop_micros_;
+  double accumulated_micros_;
 };
 }  // namespace mace

--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -41,10 +41,10 @@ class Tuner {
  template <typename RetType>
  RetType TuneOrRun(
      const std::string param_key,
-      const std::vector<param_type> &default_param,
+      std::vector<param_type> &default_param,
      const std::function<std::vector<std::vector<param_type>>()>
          &param_generator,
-      const std::function<RetType(const std::vector<param_type> &)> &func,
+      const std::function<RetType(std::vector<param_type> &, Timer *)> &func,
      Timer *timer) {
    std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key);
    if (IsTuning() && param_generator != nullptr) {
@@ -60,12 +60,12 @@ class Tuner {
      if (param_table_.find(obfucated_param_key) != param_table_.end()) {
        VLOG(1) << param_key << ": "
                << internal::MakeString(param_table_[obfucated_param_key]);
-        return func(param_table_[obfucated_param_key]);
+        return func(param_table_[obfucated_param_key], nullptr);
      } else {
 #ifndef MACE_DISABLE_NO_TUNING_WARNING
        LOG(WARNING) << "Fallback to default parameter: " << param_key;
 #endif
-        return func(default_param);
+        return func(default_param, nullptr);
      }
    }
  }
@@ -119,18 +119,16 @@ class Tuner {
  template <typename RetType>
  inline RetType Run(
-      const std::function<RetType(const std::vector<param_type> &)> &func,
+      const std::function<RetType(std::vector<param_type> &, Timer *)> &func,
-      const std::vector<param_type> &params,
+      std::vector<param_type> &params,
      Timer *timer,
      int num_runs,
      double *time_us) {
    RetType res;
    int64_t total_time_us = 0;
    for (int i = 0; i < num_runs; ++i) {
-      timer->StartTiming();
+      res = func(params, timer);
-      res = func(params);
+      total_time_us += timer->AccumulatedMicros();
-      timer->StopTiming();
-      total_time_us += timer->ElapsedMicros();
    }
    *time_us = total_time_us * 1.0 / num_runs;
@@ -141,13 +139,13 @@ class Tuner {
  inline RetType Tune(
      const std::function<std::vector<std::vector<param_type>>()>
          &param_generator,
-      const std::function<RetType(const std::vector<param_type> &)> &func,
+      const std::function<RetType(std::vector<param_type> &, Timer *)> &func,
      Timer *timer,
      std::vector<param_type> *opt_params) {
    RetType res;
    double opt_time = std::numeric_limits<double>::max();
    auto params = param_generator();
-    for (const auto &param : params) {
+    for (auto param : params) {
      double tmp_time = 0.0;
      // warm up
      Run<RetType>(func, param, timer, 2, &tmp_time);