diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 5236f4c8baa8699edd9a27360f86ab4b8582e5e7..04b844e8e4e64f9098c8b3d4d5000599f75ed713 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -110,7 +110,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::string tuning_key =
       Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index ab4d97bca81d93f7d19c87fb72f10ea14b3e3c11..a67c6aaf120f8a04acf6ba065261fa9aa6b8f377 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -105,7 +105,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensors[0]->shape();
   }
 
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
      << "_" << output_shape[2] << "_" << output_shape[3];
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index d29d8b42070577950a1a26304b751e5cd065f366..84f926334c71494436744a7df5ff9151a67e4ed6 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -116,7 +116,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index a03652ff2f3a9f3100b94c67233ea3a34c3b4dbd..0a6a460798e47da02bf91a37e4ba8282d1ef0535 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -90,7 +90,7 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "channel_shuffle_opencl_kernel_"
      << output->dim(0) << "_"
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 0b278bd3928cf6f1195a8b0fb6f18798958f8325..b22896ea84a71253409fdd54cc7c6a1d0aec919d 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -95,7 +95,7 @@ static void Concat2(cl::Kernel *kernel,
     *prev_input_shape = input0->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::stringstream ss;
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 51a860ad601f526eb4cc1ebb0fc2ad0e6e491d76..9f7694bc6517b66cc4dc146babfc7354c06081c5 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -130,7 +130,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index bea5601a9d86eb1e6a48519c1f6c95928f411848..fb86602c99fc5efaefa2b7f2cb9d5c272798a50c 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -128,7 +128,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
+  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 0};
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 0e2f30cbe68b019e5995ca7d301501b3aec9a7de..9db8df0f64bf676a2d090532299df6a138541162 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -130,7 +130,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::string tuning_key =
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc
index 4256f5ba54b186a0646e92420a9621e73d2712bb..cf716c27937c9c14b8df53f4061e3722aa4e8797 100644
--- a/mace/kernels/opencl/cwise_opencl.cc
+++ b/mace/kernels/opencl/cwise_opencl.cc
@@ -76,7 +76,7 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::stringstream ss;
   ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc
index 56e1e106f3a33c7ca2dd448dcf35f7ca49fb9745..1ecbc891bfe9682b3c0f64a23f393e5ada3c5be4 100644
--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -134,7 +134,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 
   if (runtime->IsOutOfRangeCheckEnabled()) {
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 1b2b372574bf62dc5f3b55cc9ca9757f4bf83b54..534aa64e6152e2f6eaa930f8e98c8960711d5479 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -149,7 +149,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
   std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
                                   batch, height, width, channels, multiplier);
   TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index 9a875a820cb675007e5f74d520c7589627474223..629ba89045b043f2b1f7965eefc875a32a78b8ae 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -85,7 +85,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     input_shape_ = input0->shape();
   }
 
-  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
   std::stringstream ss;
   ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index bf0961b82ff99dad3c183ac2f0306fb93598dcc0..50f8ed5cd5e9c3ad40b8f9cc0d3a5928fc1fb93c 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -233,7 +233,7 @@ void FCWTXKernel(cl::Kernel *kernel,
 
     uint32_t kwg_size =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    *lws = {16, kwg_size/16, 1};
+    *lws = {16, kwg_size/16, 0};
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     const index_t batch = output->dim(0);
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index d822a7c050f3b4a021e9e72255e1332050d42461..06df28895506f64463e9d00451b3162d30a7b2c6 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -223,23 +223,23 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
     return {
         // TODO(heliangliang): tuning these magic numbers
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {4, kwg_size / 16, 4, 1},
-        {4, kwg_size / 28, 7, 1},
-        {4, kwg_size / 32, 8, 1},
-        {4, kwg_size / 56, 14, 1},
-        {1, kwg_size, 1, 1},
+        {local_ws[0], local_ws[1], local_ws[2], 0},
+        {kwg_size / 16, 4, 4, 0},
+        {kwg_size / 32, 4, 8, 0},
+        {kwg_size / 32, 8, 4, 0},
+        {kwg_size / 64, 8, 8, 0},
+        {kwg_size / 64, 16, 4, 0},
+        {kwg_size / 128, 8, 16, 0},
+        {kwg_size / 128, 16, 8, 0},
+        {kwg_size / 128, 32, 4, 0},
+        {1, kwg_size / 32, 32, 0},
+        {1, kwg_size / 64, 64, 0},
+        {1, kwg_size / 128, 128, 0},
+        {4, kwg_size / 16, 4, 0},
+        {4, kwg_size / 28, 7, 0},
+        {4, kwg_size / 32, 8, 0},
+        {4, kwg_size / 56, 14, 0},
+        {1, kwg_size, 1, 0},
     };
   };
   cl::Event event;
@@ -248,46 +248,35 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
     MACE_CHECK(params.size() == 4)
         << "Tuning parameters of 3D kernel must be 4D";
     cl_int error = CL_SUCCESS;
-    std::vector<uint32_t> roundup_gws(3);
+    std::vector<uint32_t> internal_gws(gws, gws+3);
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       for (size_t i = 0; i < 3; ++i) {
-        roundup_gws[i] = RoundUp(gws[i], params[i]);
+        internal_gws[i] = RoundUp(gws[i], params[i]);
       }
     }
 
     if (timer == nullptr) {
-      uint32_t num_blocks = params[3];
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
+      uint32_t block_size = params[3] == 0 ? internal_gws[2] : params[3];
+      const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[2],
+                                                       block_size);
       for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        if (runtime->IsNonUniformWorkgroupsSupported()) {
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, 0, i * block_size),
-              cl::NDRange(gws[0], gws[1], gws2),
-              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        } else {
-          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, 0, i * block_size),
-              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        uint32_t gws2 = block_size;
+        if (runtime->IsNonUniformWorkgroupsSupported()
+            && (i == num_blocks - 1)) {
+          gws2 = (internal_gws[2] - (i * block_size));
         }
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(internal_gws[0], internal_gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
         MACE_CHECK_CL_SUCCESS(error);
       }
     } else {
       timer->ClearTiming();
-      if (runtime->IsNonUniformWorkgroupsSupported()) {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      } else {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange,
-            cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      }
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange,
+          cl::NDRange(internal_gws[0], internal_gws[1], internal_gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
       MACE_CHECK_CL_SUCCESS(error);
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
@@ -297,24 +286,22 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         timer->ClearTiming();
         uint32_t num_blocks = std::min(
             static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-        (*tuning_result)[3] = num_blocks;
-        const uint32_t block_size = gws[2] / num_blocks;
-        if (gws[2] % num_blocks > 0) num_blocks++;
+        uint32_t block_size = gws[2] / num_blocks;
+        if (!runtime->IsNonUniformWorkgroupsSupported()) {
+          block_size = RoundUp(block_size, params[2]);
+        }
+        (*tuning_result)[3] = block_size;
+        num_blocks = RoundUpDiv<uint32_t>(internal_gws[2], block_size);
         for (uint32_t i = 0; i < num_blocks; ++i) {
-          uint32_t gws2 =
-              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          if (runtime->IsNonUniformWorkgroupsSupported()) {
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, 0, i * block_size),
-                cl::NDRange(gws[0], gws[1], gws2),
-                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-          } else {
-            uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, 0, i * block_size),
-                cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
-                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          uint32_t gws2 = block_size;
+          if (runtime->IsNonUniformWorkgroupsSupported()
+              && (i == num_blocks - 1)) {
+            gws2 = (internal_gws[2] - (i * block_size));
           }
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(internal_gws[0], internal_gws[1], gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
           MACE_CHECK_CL_SUCCESS(error);
           timer->AccumulateTiming();
         }
@@ -349,16 +336,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     uint32_t local_ws[2];
     local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
     local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1], 1},
-            {local_ws[1], local_ws[0], 1},
-            {kwg_size / 4, 4, 1},
-            {kwg_size / 16, 16, 1},
-            {kwg_size / 32, 32, 1},
-            {kwg_size / 64, 64, 1},
-            {kwg_size / 128, 128, 1},
-            {kwg_size / 256, 256, 1},
-            {kwg_size, 1, 1},
-            {1, kwg_size, 1}};
+    return {{local_ws[0], local_ws[1], 0},
+            {local_ws[1], local_ws[0], 0},
+            {kwg_size / 4, 4, 0},
+            {kwg_size / 16, 16, 0},
+            {kwg_size / 32, 32, 0},
+            {kwg_size / 64, 64, 0},
+            {kwg_size / 128, 128, 0},
+            {kwg_size / 256, 256, 0},
+            {kwg_size, 1, 0},
+            {1, kwg_size, 0}};
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
@@ -366,44 +353,34 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
     MACE_CHECK(params.size() == 3)
         << "Tuning parameters of 2D kernel must be 3d";
     cl_int error = CL_SUCCESS;
-    std::vector<uint32_t> roundup_gws(2);
+    std::vector<uint32_t> internal_gws(gws, gws+2);
     if (!runtime->IsNonUniformWorkgroupsSupported()) {
       for (size_t i = 0; i < 2; ++i) {
-        roundup_gws[i] = RoundUp(gws[i], params[i]);
+        internal_gws[i] = RoundUp(gws[i], params[i]);
       }
     }
 
     if (timer == nullptr) {
-      uint32_t num_blocks = params[2];
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
+      uint32_t block_size = params[2] == 0 ? internal_gws[1] : params[2];
+      const uint32_t num_blocks = RoundUpDiv<uint32_t>(internal_gws[1],
+                                                       block_size);
       for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        if (runtime->IsNonUniformWorkgroupsSupported()) {
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
-              cl::NDRange(params[0], params[1]), nullptr, &event);
-        } else {
-          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-          error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size),
-              cl::NDRange(roundup_gws[0], roundup_gws1),
-              cl::NDRange(params[0], params[1]), nullptr, &event);
+        uint32_t gws1 = block_size;
+        if (runtime->IsNonUniformWorkgroupsSupported()
+            && (i == num_blocks - 1)) {
+          gws1 = (internal_gws[1] - (i * block_size));
         }
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NDRange(0, i * block_size),
+            cl::NDRange(internal_gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
         MACE_CHECK_CL_SUCCESS(error);
       }
     } else {
       timer->ClearTiming();
-      if (runtime->IsNonUniformWorkgroupsSupported()) {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-      } else {
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-      }
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange, cl::NDRange(internal_gws[0], internal_gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
       MACE_CHECK_CL_SUCCESS(error);
       timer->AccumulateTiming();
       tuning_result->assign(params.begin(), params.end());
@@ -413,24 +390,22 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         timer->ClearTiming();
         uint32_t num_blocks = std::min(
             static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
-        (*tuning_result)[2] = num_blocks;
-        const uint32_t block_size = gws[1] / num_blocks;
-        if (gws[1] % num_blocks > 0) num_blocks++;
+        uint32_t block_size = gws[1] / num_blocks;
+        if (!runtime->IsNonUniformWorkgroupsSupported()) {
+          block_size = RoundUp(block_size, params[1]);
+        }
+        (*tuning_result)[2] = block_size;
+        num_blocks = RoundUpDiv<uint32_t>(internal_gws[1], block_size);
         for (uint32_t i = 0; i < num_blocks; ++i) {
-          uint32_t gws1 =
-              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          if (runtime->IsNonUniformWorkgroupsSupported()) {
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, i * block_size),
-                cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
-                nullptr, &event);
-          } else {
-            uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
-            error = runtime->command_queue().enqueueNDRangeKernel(
-                kernel, cl::NDRange(0, i * block_size),
-                cl::NDRange(roundup_gws[0], roundup_gws1),
-                cl::NDRange(params[0], params[1]), nullptr, &event);
+          uint32_t gws1 = block_size;
+          if (runtime->IsNonUniformWorkgroupsSupported()
+              && (i == num_blocks - 1)) {
+            gws1 = (internal_gws[1] - (i * block_size));
           }
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size),
+              cl::NDRange(internal_gws[0], gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
           MACE_CHECK_CL_SUCCESS(error);
           timer->AccumulateTiming();
         }
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index a462f948b21f06ce5322c7216a34aa86d8e0e991..d941040ed83ed1151569c5139eb30f73436358b5 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -84,7 +84,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   kernel_.setArg(idx++, static_cast<int>(height_blocks));
   kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
 
-  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
   std::stringstream ss;
   ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
      << C->dim(2) << "_" << C->dim(3);
diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc
index df408555e097075927fd8f24fe6a76841f2c63b8..45f27d6dae21ec2f8b2c3d5b9d4913c913f753b0 100644
--- a/mace/kernels/opencl/pad.cc
+++ b/mace/kernels/opencl/pad.cc
@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::string tuning_key =
       Concat("pad", output->dim(0), output->dim(1), output->dim(2),
              output->dim(3));
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index ea1e3f2cdaac470cb41aa80ecd4f4237fea903a5..e3d9081e007446d54e5ccdb68014d48caa1855f2 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -134,7 +134,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     };
   }
 
-  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 15cbaf575fc4cbcf24c08f6b42be94495f4fd825..be4fe3cd103d7f5ce25fe41c0a26d06d7b69d8fd 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -99,7 +99,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 54cf842c912fd3987cbc270ec5f75569d3011b93..7e463997b1f70c1c7593ead2fb5e48dd70a728dd 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -81,7 +81,7 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     input_shape_ = logits->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index e4702c9c42f19d1552ba348b56df1e51c463bd4d..c5b9df5a0b72bcf758c0728c63bb7e4a883afde9 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -105,7 +105,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     space_shape_ = space_tensor->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
   std::stringstream ss;
   ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 480825f6003fe8562977109bccb20ac82d4dc113..591a6208e654ac5edbf12f7687f844e5e52f6acc 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -54,7 +54,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   std::vector<index_t> output_shape(4);
-  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
+  std::vector<index_t> filter_shape = {3, 3, 1, input_tensor->dim(3)};
   std::vector<int> paddings(2);
   if (paddings_.empty()) {
     kernels::CalcNHWCPaddingAndOutputSize(
@@ -101,7 +101,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
 
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
   std::stringstream ss;
   ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
      << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
@@ -215,7 +215,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     input_shape_ = input_tensor->shape();
   }
 
-  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
 
   std::stringstream ss;
   ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 14e39039df14edb4b62723574e97a6e93b2d8257..69950ccd990dd801207ee9d0d572f034d63af6f5 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -559,6 +559,7 @@ class CaffeConverter(object):
         paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
 
         filter_shape = np.asarray(op.data[0].shape)
+        filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI
 
         input_format = 'NHWC'
         output_shape = Shapes.conv_pool_shape(