diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index d3e6c7f90748ed0061f4671b4d540cf8b7129563..f41513c54dd66b38d28e9c0f21b21998711d0d0a 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -71,6 +71,11 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     int idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
@@ -78,9 +83,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
     kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     input_shape_ = input->shape();
 
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index d7c149a9720ea30b45c9c3745422cd19f4c7820a..c2c19fa780a19e05aa43702322be799f97412417 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -70,12 +70,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     for (auto input : input_tensors) {
       kernel_.setArg(idx++, *(input->opencl_image()));
     }
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
 
     input_shape_ = input_tensors[0]->shape();
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index c3a1765ce83a08e446c6d27393420a18ea61d544..8065acba80374af51a2f1368e45f1e46ca4fc869 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -75,6 +75,11 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(scale->opencl_image()));
     kernel_.setArg(idx++, *(offset->opencl_image()));
@@ -85,9 +90,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
     kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     input_shape_ = input->shape();
 
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index e67ebe712c9c4604566a0f4c3baddd273a205ee1..a518f074209a05b2676356ff6bc3d50bf890abd7 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -45,12 +45,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(bias->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
     input_shape_ = input->shape();
 
     kwg_size_ =
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 565b3d569934a3b2618ffbf48d278fa0041b8953..5d7ae4c1b47d7af2a0897ba17bf158148bdf5b12 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -87,6 +87,10 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                                          obfuscated_kernel_name, built_options);
 
   uint32_t idx = 0;
+  if (!is_non_uniform_work_groups_supported) {
+    b2f_kernel.setArg(idx++, gws[0]);
+    b2f_kernel.setArg(idx++, gws[1]);
+  }
   b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
   if (!i2b_) {
     MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
@@ -112,8 +116,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
     b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
   }
   b2f_kernel.setArg(idx++, *(image->opencl_image()));
-  b2f_kernel.setArg(idx++, gws[0]);
-  b2f_kernel.setArg(idx++, gws[1]);
 
   const uint32_t kwg_size =
       static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 316ae62a2217b03cf09b1b6dd92e0142fd89b3b0..29097345417aa0a1e30f532a25c8735b2831dd05 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -54,13 +54,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, groups_);
     kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
     kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     input_shape_ = input->shape();
 
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index a02b0e3570e7e27a96d6afab00843e3a6b648060..7976dd38c811a082c186507c2f7a5f446aadd6c9 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -1,19 +1,17 @@
 #include <common.h>
 
-__kernel void activation(__read_only image2d_t input,
+__kernel void activation(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                         __private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+#endif
+                         __read_only image2d_t input,
 #ifdef USE_PRELU
                          __read_only image2d_t alpha,
 #endif
                          __private const float relux_max_limit,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                         __write_only image2d_t output,
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2) {
-#else
                          __write_only image2d_t output) {
-#endif
-
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index 23e47e50157a8fe2ae1d8f12d3d16e5d7284ed6f..09dd5c388de0bf5b30a873d6a20a55181e341d06 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -1,6 +1,11 @@
 #include <common.h>
 
-__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void addn(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                   __private const int global_size_dim0,
+                   __private const int global_size_dim1,
+#endif
+                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                    __read_only image2d_t input1,
 #if INPUT_NUM > 2
                    __read_only image2d_t input2,
@@ -8,14 +13,7 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
 #if INPUT_NUM > 3
                    __read_only image2d_t input3,
 #endif
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                   __write_only image2d_t output,
-                   __private const int global_size_dim0,
-                   __private const int global_size_dim1) {
-#else
                    __write_only image2d_t output) {
-#endif
-
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index d36c1e8bc054afeca6c0bda30740c54040bd906b..0e592fdcbd26bfa3b274ea222d7d7cb3070b132d 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -1,6 +1,12 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(__read_only image2d_t input,
+__kernel void batch_norm(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                         __private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+#endif
+                         __read_only image2d_t input,
                          __read_only image2d_t scale,
                          __read_only image2d_t offset,
 #ifndef FOLDED_CONSTANT
@@ -9,15 +15,7 @@ __kernel void batch_norm(__read_only image2d_t input,
                          __private const float epsilon,
 #endif
                          __write_only image2d_t output,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                         __private const float relux_max_limit,
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2) {
-#else
                          __private const float relux_max_limit) {
-#endif
-
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index 594528ce30ffb1164f1e02fc22ee12e708206ccd..ee7b60786a57466178e3517face1ee7900ac639c 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -1,15 +1,14 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(__read_only image2d_t input,
-                       __read_only image2d_t bias,
+__kernel void bias_add(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                       __write_only image2d_t output,
                        __private const int global_size_dim0,
                        __private const int global_size_dim1,
-                       __private const int global_size_dim2) {
-#else
-                       __write_only image2d_t output) {
+                       __private const int global_size_dim2,
 #endif
+                       __read_only image2d_t input,
+                       __read_only image2d_t bias,
+                       __write_only image2d_t output) {
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index a5d9f289efc7266feb77c7c868b51cfe4b25fb28..7e764503656fbf7d4ed41c245753663899c85fcb 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -1,19 +1,17 @@
 #include <common.h>
 
-__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
+__kernel void filter_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
                                      __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __write_only image2d_t output,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                      __write_only image2d_t output) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -58,19 +56,17 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+__kernel void filter_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_h,
                                      __private const int filter_w,
                                      __private const int out_channel,
                                      __private const int in_channel,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __read_only image2d_t input,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                      __read_only image2d_t input) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -112,19 +108,17 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
   }
 }
 
-__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+__kernel void dw_filter_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                        __private const int global_size_dim0,
+                                        __private const int global_size_dim1,
+#endif
+                                        __global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int input_offset,
                                         __private const int filter_w,
                                         __private const int in_channel,
                                         __private const int multiplier,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                        __write_only image2d_t output,
-                                        __private const int global_size_dim0,
-                                        __private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */
-#else
-                                        __write_only image2d_t output) {
-#endif
-
+                                        __write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */
   const int w = get_global_id(0);
   const int h = get_global_id(1);
 
@@ -175,19 +169,17 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global const DATA_TYPE *input, /* nhwc */
                                      __private const int input_offset,
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __write_only image2d_t output,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                      __write_only image2d_t output) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -222,18 +214,16 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void in_out_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
                                      __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __read_only image2d_t input,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                      __read_only image2d_t input) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -267,17 +257,15 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   }
 }
 
-__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
-                                  __private const int input_offset,
-                                  __private const int count,
+__kernel void arg_buffer_to_image(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __write_only image2d_t output,
                                   __private const int global_size_dim0,
-                                  __private const int global_size_dim1) {
-#else
-                                  __write_only image2d_t output) {
+                                  __private const int global_size_dim1,
 #endif
-
+                                  __global const DATA_TYPE *input, /* nhwc */
+                                  __private const int input_offset,
+                                  __private const int count,
+                                  __write_only image2d_t output) {
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -308,16 +296,14 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
-                                  __private const int count,
+__kernel void arg_image_to_buffer(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __read_only image2d_t input,
                                   __private const int global_size_dim0,
-                                  __private const int global_size_dim1) {
-#else
-                                  __read_only image2d_t input) {
+                                  __private const int global_size_dim1,
 #endif
-
+                                  __global DATA_TYPE *output, /* nhwc */
+                                  __private const int count,
+                                  __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -347,19 +333,17 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 }
 
 
-__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
+__kernel void in_out_height_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                            __private const int global_size_dim0,
+                                            __private const int global_size_dim1,
+#endif
+                                            __global const DATA_TYPE *input, //nhwc
                                             __private const int input_offset,
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                            __write_only image2d_t output,
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1) {
-#else
                                             __write_only image2d_t output) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -395,18 +379,16 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
+__kernel void in_out_height_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                            __private const int global_size_dim0,
+                                            __private const int global_size_dim1,
+#endif
+                                            __global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
                                             __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                            __read_only image2d_t input,
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1) {
-#else
                                             __read_only image2d_t input) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -439,19 +421,17 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
 }
 
 
-__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_width_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                           __private const int global_size_dim0,
+                                           __private const int global_size_dim1,
+#endif
+                                           __global const DATA_TYPE *input, /* nhwc */
                                            __private const int input_offset,
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                           __write_only image2d_t output,
-                                           __private const int global_size_dim0,
-                                           __private const int global_size_dim1) {
-#else
                                            __write_only image2d_t output) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -487,19 +467,17 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
+__kernel void winograd_filter_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                              __private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+#endif
+                                              __global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __private const int input_offset,
                                               __private const int in_channels,
                                               __private const int height,
                                               __private const int width,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                              __write_only image2d_t output,
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1) {
-#else
                                               __write_only image2d_t output) {
-#endif
-
   int w = get_global_id(0);
   int h = get_global_id(1);
 
@@ -584,18 +562,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
+__kernel void winograd_filter_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                              __private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+#endif
+                                              __global DATA_TYPE *output, //Oc, Ic, H, W
                                               __private const int height,
                                               __private const int width,
                                               __private const int channel,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                              __read_only image2d_t input,
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1) {
-#else
                                               __read_only image2d_t input) {
-#endif
-
   const int w = get_global_id(0);
   const int h = get_global_id(1);
 
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 87159784467f4e118f5aced5640d6185ff50f14f..5bf0e067a30d4248eff32858ec11fe171fff53b5 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -1,18 +1,16 @@
 #include <common.h>
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(__read_only image2d_t input,
-                      __private const int groups,
-                      __private const int channels_per_group,
+__kernel void channel_shuffle(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
-                      __write_only image2d_t output) {
+                              __private const int global_size_dim0,
+                              __private const int global_size_dim1,
+                              __private const int global_size_dim2,
 #endif
-
+                              __read_only image2d_t input,
+                              __private const int groups,
+                              __private const int channels_per_group,
+                              __write_only image2d_t output) {
   const int group_chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index c8bfebaa1765afed605ee6561fa96fddb24f4bd8..2658025df79333f23c063a99310c1acecb3cd014 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,18 +22,16 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }
 
 // Supported data type: half/float
-__kernel void concat_channel(__read_only image2d_t input0,
-                             __read_only image2d_t input1,
-                             __private const int input0_chan,
+__kernel void concat_channel(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                             __write_only image2d_t output,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
-                             __write_only image2d_t output) {
+                             __private const int global_size_dim2,
 #endif
-
+                             __read_only image2d_t input0,
+                             __read_only image2d_t input1,
+                             __private const int input0_chan,
+                             __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
@@ -90,17 +88,15 @@ __kernel void concat_channel(__read_only image2d_t input0,
 }
 
 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(__read_only image2d_t input,
-                                   __private const int chan_blk_offset,
+__kernel void concat_channel_multi(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                                   __write_only image2d_t output,
                                    __private const int global_size_dim0,
                                    __private const int global_size_dim1,
-                                   __private const int global_size_dim2) {
-#else
-                                   __write_only image2d_t output) {
+                                   __private const int global_size_dim2,
 #endif
-
+                                   __read_only image2d_t input,
+                                   __private const int chan_blk_offset,
+                                   __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index 8fa23f02db62c5c43469e7c4f5095830ebeeed1e..f40f31dab55dd7da07385004185ea5d9e84b9bfc 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -1,6 +1,12 @@
 #include <common.h>
 
-__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+                      __private const int global_size_dim2,
+#endif
+                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
     __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -18,15 +24,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int padding_top,
                       __private const int padding_left,
                       __private const int dilation_h,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __private const int dilation_w,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
                       __private const int dilation_w) {
-#endif
-
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 70d888670b26f536c451d2852f2b1652f8661a25..96d9a2c06aae8fd81e420eb1b0b407149bc13bdd 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,6 +1,12 @@
 #include <common.h>
 
-__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_1x1(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                          __private const int global_size_dim0,
+                          __private const int global_size_dim1,
+                          __private const int global_size_dim2,
+#endif
+                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -12,15 +18,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int in_ch_blks,
                           __private const int height,
                           __private const int width,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                          __private const int stride,
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2) {
-#else
                           __private const int stride) {
-#endif
-
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 8ce485b7f3f92e46d3d25a1a95bcc624564d1125..b159fd6af0fdcdecd2f9586f9992f1412548398c 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,6 +1,12 @@
 #include <common.h>
 
-__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_3x3(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                          __private const int global_size_dim0,
+                          __private const int global_size_dim1,
+                          __private const int global_size_dim2,
+#endif
+                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -16,15 +22,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __private const int padding_top,
                           __private const int padding_left,
                           __private const int dilation_h,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                          __private const int dilation_w,
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2) {
-#else
                           __private const int dilation_w) {
-#endif
-
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl
index 349b665daa608086f0013d2f5702e7c1353bffa0..2a5a8893ea2c86ba6a13cc1d9dc24dfa728088ac 100644
--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
@@ -1,17 +1,15 @@
 #include <common.h>
 
-__kernel void depth_to_space(__read_only image2d_t input,
-                             __private const int block_size,
-                             __private const int output_depth,
+__kernel void depth_to_space(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                             __write_only image2d_t output,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
-                             __write_only image2d_t output) {
+                             __private const int global_size_dim2,
 #endif
-
+                             __read_only image2d_t input,
+                             __private const int block_size,
+                             __private const int output_depth,
+                             __write_only image2d_t output) {
   const int out_d = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_h = get_global_id(2);
@@ -44,17 +42,16 @@ __kernel void depth_to_space(__read_only image2d_t input,
   WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
 }
 
-__kernel void space_to_depth(__read_only image2d_t input,
-                             __private const int block_size,
-                             __private const int input_depth,
+__kernel void space_to_depth(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                             __write_only image2d_t output,
                              __private const int global_size_dim0,
                              __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
-                             __write_only image2d_t output) {
+                             __private const int global_size_dim2,
 #endif
+                             __read_only image2d_t input,
+                             __private const int block_size,
+                             __private const int input_depth,
+                             __write_only image2d_t output) {
 
   const int d = get_global_id(0);
   const int w = get_global_id(1);
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index 7d39d3c1375c961039d933c3aa598bb1306fcdfc..1974d8db50935f2abc96e6a309e8fc41f448deb3 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -1,7 +1,13 @@
 #include <common.h>
 
 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                               __private const int global_size_dim0,
+                               __private const int global_size_dim1,
+                               __private const int global_size_dim2,
+#endif
+                               __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
     __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -18,15 +24,7 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
                                __private const short padding_top,
                                __private const short padding_left,
                                __private const short dilation_h,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                               __private const short dilation_w,
-                               __private const int global_size_dim0,
-                               __private const int global_size_dim1,
-                               __private const int global_size_dim2) {
-#else
                                __private const short dilation_w) {
-#endif
-
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1);
   const short out_hb = get_global_id(2);
@@ -144,7 +142,13 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }
 
-__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d_s1(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                  __private const int global_size_dim0,
+                                  __private const int global_size_dim1,
+                                  __private const int global_size_dim2,
+#endif
+                                  __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
     __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -159,15 +163,7 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
                                   __private const short filter_height,
                                   __private const short filter_width,
                                   __private const short padding_top,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __private const short padding_left,
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-                                  __private const int global_size_dim2) {
-#else
                                   __private const short padding_left) {
-#endif
-
   const short out_ch_blk = get_global_id(0);
   const short out_w_blk = get_global_id(1) << 2;
   const short out_hb = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index d7c90e03924566165fd3665f36cb113d7f52b7fe..9a69af1af46cacff54c20c1c5583f25bfdec4907 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -1,19 +1,17 @@
 #include <common.h>
 
-__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void eltwise(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+#endif
+                      __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t input1,
 #ifdef COEFF_SUM
                       __private const float coeff0,
                       __private const float coeff1,
 #endif
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1) {
-#else
                       __write_only image2d_t output) {
-#endif
-
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index 7107838c894b7050e2d6b3b02bd7e85712a87d4c..c3efc9f25a8869af9e10f27d74d6ea113d94826e 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -1,21 +1,19 @@
 #include <common.h>
 
 // C = A * B
-__kernel void matmul(__read_only image2d_t A,
+__kernel void matmul(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                     __private const int global_size_dim0,
+                     __private const int global_size_dim1,
+#endif
+                     __read_only image2d_t A,
                      __read_only image2d_t B,
                      __write_only image2d_t C,
                      __private const int M,
                      __private const int N,
                      __private const int K,
                      __private const int height_blocks,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                     __private const int k_blocks,
-                     __private const int global_size_dim0,
-                     __private const int global_size_dim1) {
-#else
                      __private const int k_blocks) {
-#endif
-
   const int gx = get_global_id(0) << 2;
   const int hb = get_global_id(1);
 
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index 8cdc4e4625a24e409c38697d9e40a39e57007ab7..0a28b745d5f78cbd8b73181dff2b960c664aa6ae 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,7 +19,13 @@ inline int calculate_avg_block_size(const int pool_size,
 }
 
 // Supported data type: half/float
-__kernel void pooling(__read_only image2d_t input,
+__kernel void pooling(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+                      __private const int global_size_dim2,
+#endif
+                      __read_only image2d_t input,
                       __private const int in_height,
                       __private const int in_width,
                       __private const int out_height,
@@ -27,14 +33,7 @@ __kernel void pooling(__read_only image2d_t input,
                       __private const int pad_left,
                       __private const int stride,
                       __private const int pooling_size,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
                       __write_only image2d_t output) {
-#endif
 
   const int out_chan_idx = get_global_id(0);
   const int out_width_idx = get_global_id(1);
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index 5369c7625846757556debce41a2835b9fa71651f..8c5b7a33b561c7bbd37366630725546f64a9d72d 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -1,19 +1,18 @@
 #include <common.h>
 
-__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void resize_bilinear_nocache(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                      __private const int global_size_dim0,
+                                      __private const int global_size_dim1,
+                                      __private const int global_size_dim2,
+#endif
+                                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
                                       __private const float height_scale,
                                       __private const float width_scale,
                                       __private const int in_height,
                                       __private const int in_width,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                      __private const int out_height,
-                                      __private const int global_size_dim0,
-                                      __private const int global_size_dim1,
-                                      __private const int global_size_dim2) {
-#else
                                       __private const int out_height) {
-#endif
 
   const int ch_blk = get_global_id(0);
   const int w = get_global_id(1);
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index bb5f40cda7f1fdafecab81d0e48c3b3b410b2388..4517ec99a0f23c58377de0d784cb59f39bf604c1 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -1,16 +1,14 @@
 #include <common.h>
 
-__kernel void slice(__read_only image2d_t input,
-                    __private const int chan_blk_offset,
+__kernel void slice(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                    __write_only image2d_t output,
                     __private const int global_size_dim0,
                     __private const int global_size_dim1,
-                    __private const int global_size_dim2) {
-#else
-                    __write_only image2d_t output) {
+                    __private const int global_size_dim2,
 #endif
-
+                    __read_only image2d_t input,
+                    __private const int chan_blk_offset,
+                    __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index 3fadd18e1b814e58ef0437e65aef7ea5820c9ca4..11ff80bf9f601d874ea81df7fac2f12a8001fa2e 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -1,17 +1,15 @@
 #include <common.h>
 
-__kernel void softmax(__read_only image2d_t input,
-                      __private const int channels,
-                      __private const int remain_channels,
+__kernel void softmax(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
-                      __write_only image2d_t output) {
+                      __private const int global_size_dim2,
 #endif
-
+                      __read_only image2d_t input,
+                      __private const int channels,
+                      __private const int remain_channels,
+                      __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 822d09063e2ff3d40bdf4ad954325db9be90cf14..0a54601269284a5038d733154a836515618e7443 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,6 +1,12 @@
 #include <common.h>
 
-__kernel void space_to_batch(__read_only image2d_t space_data,
+__kernel void space_to_batch(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                             __private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2,
+#endif
+                             __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
                              __private const int block_height,
                              __private const int block_width,
@@ -9,15 +15,7 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
                              __private const int space_height,
                              __private const int space_width,
                              __private const int batch_height,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int batch_width,
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
                              __private const int batch_width) {
-#endif
-
   const int chan_idx = get_global_id(0);
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
@@ -54,7 +52,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
   WRITE_IMAGET(batch_data, batch_coord, value);
 }
 
-__kernel void batch_to_space(__read_only image2d_t batch_data,
+__kernel void batch_to_space(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                             __private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2,
+#endif
+                             __read_only image2d_t batch_data,
                              __write_only image2d_t space_data,
                              __private const int block_height,
                              __private const int block_width,
@@ -63,15 +67,7 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
                              __private const int space_height,
                              __private const int space_width,
                              __private const int batch_height,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int batch_width,
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
                              __private const int batch_width) {
-#endif
-
   const int chan_idx = get_global_id(0);
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index 098c8e3b2f119f2d55dfe5d0250ddf70086a8554..f3f99cfa567514ae4ba3a99c73930dd8e13efa6f 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -1,6 +1,11 @@
 #include <common.h>
 
-__kernel void winograd_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_transform_2x2(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __read_only image2d_t input,
                                      __write_only image2d_t output,
                                      __private const int in_height,
                                      __private const int in_width,
@@ -8,14 +13,7 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
                                      __private const int round_hw,
                                      __private const int round_w,
                                      __private const int padding_top,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int padding_left,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                      __private const int padding_left) {
-#endif
-
   int out_width_idx = get_global_id(0);
   int chan_blk_idx = get_global_id(1);
 
@@ -121,7 +119,12 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
   }
 }
 
-__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_inverse_transform_2x2(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                             __private const int global_size_dim0,
+                                             __private const int global_size_dim1,
+#endif
+                                             __read_only image2d_t input,
 #ifdef BIAS
                                              __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -130,14 +133,7 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                              __private const int out_width,
                                              __private const int round_hw,
                                              __private const int round_w,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                             __private const float relux_max_limit,
-                                             __private const int global_size_dim0,
-                                             __private const int global_size_dim1) {
-#else
                                              __private const float relux_max_limit) {
-#endif
-
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
 
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 111b7a9c17b130f582b5d37ca209f77ced0dc9ba..56449d14188d778fc25cc5e906f2b083f9e9ce42 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -56,6 +56,11 @@ static void Concat2(cl::Kernel *kernel,
   }
   if (!IsVecEqual(*prev_input_shape, input0->shape())) {
     uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
     kernel->setArg(idx++,
                    *(static_cast<const cl::Image2D *>(input0->opencl_image())));
     kernel->setArg(idx++,
@@ -63,9 +68,6 @@ static void Concat2(cl::Kernel *kernel,
     kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
     kernel->setArg(idx++,
                    *(static_cast<cl::Image2D *>(output->opencl_image())));
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);
 
     *prev_input_shape = input0->shape();
 
@@ -119,12 +121,14 @@ static void ConcatN(cl::Kernel *kernel,
     };
 
     uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
     kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);
 
     chan_blk_offset += input_channel_blk;
     *kwg_size =
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index be2fd08b8c25e82d681ab67b6ca8eecb0fe431ae..34055189421eeb7fb583b4e610f51cd01731b1ed 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -84,6 +84,11 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
@@ -98,9 +103,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     kernel->setArg(idx++, static_cast<int>(height));
     kernel->setArg(idx++, static_cast<int>(width));
     kernel->setArg(idx++, stride);
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);
 
     *prev_input_shape = input->shape();
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index cec0927fa44b79ffc897470272bd1827cd0c1308..88793dacb06dc5e75ca6bee7a8cb57c8dd0c6775 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -79,6 +79,11 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
@@ -96,9 +101,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     kernel->setArg(idx++, padding[1] / 2);
     kernel->setArg(idx++, dilations[0]);
     kernel->setArg(idx++, dilations[1]);
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);
 
     *prev_input_shape = input->shape();
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index a9151b480fa5a19b010d9220e62532fe588fc85d..191322095748de1b7a8ee2c4f6ca7049fca5a8b2 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -79,6 +79,11 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
@@ -98,9 +103,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     kernel->setArg(idx++, padding[1] / 2);
     kernel->setArg(idx++, dilations[0]);
     kernel->setArg(idx++, dilations[1]);
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);
 
     *prev_input_shape = input->shape();
 
diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc
index 4daeac61bf58589fd29676aaa9001ec37aab065d..83cff27354a563d63094d1541a4735e51b136396 100644
--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -68,12 +68,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
   uint32_t gws[3];
   std::stringstream ss;
   if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, block_size_);
-    kernel_.setArg(idx++, depth_blocks);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-
     if (d2s_) {
       gws[0] = static_cast<uint32_t>(depth_blocks);
       gws[1] = static_cast<uint32_t>(output_width);
@@ -88,9 +82,16 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
          << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
     }
 
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
+    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, block_size_);
+    kernel_.setArg(idx++, depth_blocks);
+    kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
 
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 873a16a40e937443f341119421cb85af4d1f749a..11bb38b3a603055b738e4ed2a6ea609a12e1d24b 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -97,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                input_channels);
 
     uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(filter->opencl_image()));
     if (bias != nullptr) {
@@ -117,9 +122,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
       kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
       kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
     }
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);
 
     *prev_input_shape = input->shape();
 
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index 13413130c485ecb1ffb68a42a079be27c543046d..38a231da40224da36eb0a9e972764b0a1915ead9 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -47,6 +47,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
   }
   if (!IsVecEqual(input_shape_, input0->shape())) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     kernel_.setArg(idx++, *(input0->opencl_image()));
     kernel_.setArg(idx++, *(input1->opencl_image()));
     if (!coeff_.empty()) {
@@ -54,8 +58,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
       kernel_.setArg(idx++, coeff_[1]);
     }
     kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
 
     input_shape_ = input0->shape();
 
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 19769f3d3d3d37389f55de983ed58d1480ba6935..79dcc40d67b0c2a855fb68ab632409e0d868aa6f 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -48,6 +48,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
   }
   uint32_t idx = 0;
+  if (!is_non_uniform_work_groups_supported_) {
+    kernel_.setArg(idx++, gws[0]);
+    kernel_.setArg(idx++, gws[1]);
+  }
   kernel_.setArg(idx++, *(A->opencl_image()));
   kernel_.setArg(idx++, *(B->opencl_image()));
   kernel_.setArg(idx++, *(C->opencl_image()));
@@ -56,8 +60,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
   kernel_.setArg(idx++, static_cast<int>(height_blocks));
   kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
-  kernel_.setArg(idx++, gws[0]);
-  kernel_.setArg(idx++, gws[1]);
 
   kwg_size_ =
       static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index fa9e157716919773dbc3a6d1f99beb016508a19d..9b2f96c8f1e92d7c64897bd3d5c3c7f83bcd63ea 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -81,6 +81,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     };
 
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
     kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
@@ -90,14 +95,23 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, strides_[0]);
     kernel_.setArg(idx++, kernels_[0]);
     kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     input_shape_ = input->shape();
 
     kwg_size_ =
         static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  } else {
+    index_t batch = output->dim(0);
+    index_t out_height = output->dim(1);
+    index_t out_width = output->dim(2);
+    index_t channels = output->dim(3);
+
+    index_t channel_blocks = (channels + 3) / 4;
+
+    gws = {
+        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
+        static_cast<uint32_t>(batch * out_height),
+    };
   }
 
   std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 5bcb53e37f4b354b950ec7eca44589b50d1a6dbd..ce2fe7bfbc28c9d4563e6e900d87ec753179af84 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -60,6 +60,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
         CalculateResizeScale(in_width, out_width, align_corners_);
 
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
     kernel_.setArg(idx++, height_scale);
@@ -67,9 +72,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(in_height));
     kernel_.setArg(idx++, static_cast<int32_t>(in_width));
     kernel_.setArg(idx++, static_cast<int32_t>(out_height));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     input_shape_ = input->shape();
 
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 94f541b2418afe4906710be2b5d7f89b9d61c06b..d610e1e14b6bfd49d61ed93fc95c150afd1a665e 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -65,12 +65,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      << outputs_count;
   for (int i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
     kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
   }
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 6b06cc8fb4c408a233b4799c97fb15326d695721..61ea022890a30875a845b6034e37ef8b80081e9b 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -45,13 +45,15 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     kernel_.setArg(idx++, *(logits->opencl_image()));
     kernel_.setArg(idx++, static_cast<int>(channels));
     kernel_.setArg(idx++, remain_channels);
     kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     input_shape_ = logits->shape();
 
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 6e00f6ea2ef726cb52b69ac580a4072f8a2d84a4..38da0548b974269a114022d71b6e792493e2c71e 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -57,6 +57,11 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
   }
   if (!IsVecEqual(space_shape_, space_tensor->shape())) {
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
     if (b2s_) {
       kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
       kernel_.setArg(idx++, *(space_tensor->opencl_image()));
@@ -72,9 +77,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
     kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
     kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
 
     space_shape_ = space_tensor->shape();
 
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 905b1346a18e33f42cabfe53fd4f436d10c602bd..092a60cd805acd2b28cb909395846ed8ec3bd1e2 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -61,6 +61,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, image_shape);
 
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     kernel_.setArg(idx++, *(input_tensor->opencl_image()));
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
     kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
@@ -70,8 +74,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
 
     input_shape_ = input_tensor->shape();
 
@@ -151,6 +153,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
     uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     kernel_.setArg(
         idx++,
         *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
@@ -165,8 +171,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
     kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
     kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
 
     input_shape_ = input_tensor->shape();
 
diff --git a/tools/mace_tools.py b/tools/mace_tools.py
index 4f2b209a700439fffd6f466551c0dffceb555805..2e0ea3fa386c0390202cc2cfe0449c5a58c52637 100644
--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -376,7 +376,8 @@ def main(unused_args):
         build_run_throughput_test(target_soc, FLAGS.run_seconds,
                                   merged_lib_file, FLAGS.output_dir)
 
-  packaging_lib_file(FLAGS.output_dir)
+  if FLAGS.mode == "build" or FLAGS.mode == "all":
+    packaging_lib_file(FLAGS.output_dir)
 
 
 if __name__ == "__main__":
diff --git a/tools/packaging_lib.sh b/tools/packaging_lib.sh
index 607514875a022b7a1f18117c15d4efe248b7c349..c6158cd5f9c954211d858526a4e560d9ceab8a0b 100644
--- a/tools/packaging_lib.sh
+++ b/tools/packaging_lib.sh
@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh
 
 LIBMACE_BUILD_DIR=$1
 
+TAR_PACKAGE_NAME=libmace_${PROJECT_NAME}.tar.gz
+
 pushd $LIBMACE_BUILD_DIR/$PROJECT_NAME
-ls | grep -v build | xargs tar cvzf libmace_${PROJECT_NAME}.tar.gz
+if [ -f $TAR_PACKAGE_NAME ]; then
+  rm -f $TAR_PACKAGE_NAME
+fi
+ls | grep -v build | xargs tar cvzf $TAR_PACKAGE_NAME
 popd
 
 echo "Packaging done!"