update cl source code format, fix pooling op core dump

eefc9d01 · yejianwu · 08a362e7 · eefc9d01 · eefc9d01 · eefc9d01
42 changed file
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -71,6 +71,11 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
@@ -78,9 +83,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    }
    kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    input_shape_ = input->shape();


--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -70,12 +70,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    output_tensor->ResizeImage(output_shape, output_image_shape);

    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);

    input_shape_ = input_tensors[0]->shape();


--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -75,6 +75,11 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
@@ -85,9 +90,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    input_shape_ = input->shape();


--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -45,12 +45,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();

    kwg_size_ =

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -87,6 +87,10 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                                         obfuscated_kernel_name, built_options);

  uint32_t idx = 0;
+  if (!is_non_uniform_work_groups_supported) {
+    b2f_kernel.setArg(idx++, gws[0]);
+    b2f_kernel.setArg(idx++, gws[1]);
+  }
  b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
  if (!i2b_) {
    MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
@@ -112,8 +116,6 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
  }
  b2f_kernel.setArg(idx++, *(image->opencl_image()));
-  b2f_kernel.setArg(idx++, gws[0]);
-  b2f_kernel.setArg(idx++, gws[1]);

  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));

--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -54,13 +54,15 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(

  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    input_shape_ = input->shape();


--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
 #include <common.h>

-__kernel void activation(__read_only image2d_t input,
+__kernel void activation(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                         __private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+#endif
+                         __read_only image2d_t input,
 #ifdef USE_PRELU
                         __read_only image2d_t alpha,
 #endif
                         __private const float relux_max_limit,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                         __write_only image2d_t output,
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2) {
-#else
                         __write_only image2d_t output) {
-#endif
-
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
 #include <common.h>

-__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void addn(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                   __private const int global_size_dim0,
+                   __private const int global_size_dim1,
+#endif
+                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,
 #if INPUT_NUM > 2
                   __read_only image2d_t input2,
@@ -8,14 +13,7 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
 #if INPUT_NUM > 3
                   __read_only image2d_t input3,
 #endif
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                   __write_only image2d_t output,
-                   __private const int global_size_dim0,
-                   __private const int global_size_dim1) {
-#else
                   __write_only image2d_t output) {
-#endif
-
  const int w = get_global_id(0);
  const int hb = get_global_id(1);


--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(__read_only image2d_t input,
+__kernel void batch_norm(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                         __private const int global_size_dim0,
+                         __private const int global_size_dim1,
+                         __private const int global_size_dim2,
+#endif
+                         __read_only image2d_t input,
                         __read_only image2d_t scale,
                         __read_only image2d_t offset,
 #ifndef FOLDED_CONSTANT
@@ -9,15 +15,7 @@ __kernel void batch_norm(__read_only image2d_t input,
                         __private const float epsilon,
 #endif
                         __write_only image2d_t output,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                         __private const float relux_max_limit,
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2) {
-#else
                         __private const float relux_max_limit) {
-#endif
-
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(__read_only image2d_t input,
-                       __read_only image2d_t bias,
+__kernel void bias_add(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                       __write_only image2d_t output,
                       __private const int global_size_dim0,
                       __private const int global_size_dim1,
-                       __private const int global_size_dim2) {
-#else
-                       __write_only image2d_t output) {
+                       __private const int global_size_dim2,
 #endif
+                       __read_only image2d_t input,
+                       __read_only image2d_t bias,
+                       __write_only image2d_t output) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
 #include <common.h>

-__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
+__kernel void filter_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global const DATA_TYPE *input, /* h, w, oc, ic */
                                     __private const int input_offset,
                                     __private const int filter_h,
                                     __private const int filter_w,
                                     __private const int out_channel,
                                     __private const int in_channel,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __write_only image2d_t output,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                     __write_only image2d_t output) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -58,19 +56,17 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+__kernel void filter_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global DATA_TYPE *output, /* h, w, oc, ic */
                                     __private const int filter_h,
                                     __private const int filter_w,
                                     __private const int out_channel,
                                     __private const int in_channel,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __read_only image2d_t input,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                     __read_only image2d_t input) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -112,19 +108,17 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
  }
 }

-__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+__kernel void dw_filter_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                        __private const int global_size_dim0,
+                                        __private const int global_size_dim1,
+#endif
+                                        __global const DATA_TYPE *input, /* h, w, ic, m */
                                        __private const int input_offset,
                                        __private const int filter_w,
                                        __private const int in_channel,
                                        __private const int multiplier,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                        __write_only image2d_t output,
-                                        __private const int global_size_dim0,
-                                        __private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */
-#else
-                                        __write_only image2d_t output) {
-#endif
-
+                                        __write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */
  const int w = get_global_id(0);
  const int h = get_global_id(1);

@@ -175,19 +169,17 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
                                     __private const int height,
                                     __private const int width,
                                     __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __write_only image2d_t output,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                     __write_only image2d_t output) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -222,18 +214,16 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void in_out_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
                                     __private const int width,
                                     __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __read_only image2d_t input,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                     __read_only image2d_t input) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -267,17 +257,15 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
  }
 }

-__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
-                                  __private const int input_offset,
-                                  __private const int count,
+__kernel void arg_buffer_to_image(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __write_only image2d_t output,
                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1) {
-#else
-                                  __write_only image2d_t output) {
+                                  __private const int global_size_dim1,
 #endif
-
+                                  __global const DATA_TYPE *input, /* nhwc */
+                                  __private const int input_offset,
+                                  __private const int count,
+                                  __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -308,16 +296,14 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
-                                  __private const int count,
+__kernel void arg_image_to_buffer(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __read_only image2d_t input,
                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1) {
-#else
-                                  __read_only image2d_t input) {
+                                  __private const int global_size_dim1,
 #endif
-
+                                  __global DATA_TYPE *output, /* nhwc */
+                                  __private const int count,
+                                  __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -347,19 +333,17 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 }


-__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
+__kernel void in_out_height_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                            __private const int global_size_dim0,
+                                            __private const int global_size_dim1,
+#endif
+                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                            __write_only image2d_t output,
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1) {
-#else
                                            __write_only image2d_t output) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -395,18 +379,16 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
+__kernel void in_out_height_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                            __private const int global_size_dim0,
+                                            __private const int global_size_dim1,
+#endif
+                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                            __read_only image2d_t input,
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1) {
-#else
                                            __read_only image2d_t input) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -439,19 +421,17 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
 }


-__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_width_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                           __private const int global_size_dim0,
+                                           __private const int global_size_dim1,
+#endif
+                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
                                           __private const int height,
                                           __private const int width,
                                           __private const int channels,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                           __write_only image2d_t output,
-                                           __private const int global_size_dim0,
-                                           __private const int global_size_dim1) {
-#else
                                           __write_only image2d_t output) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -487,19 +467,17 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
 }

 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
+__kernel void winograd_filter_buffer_to_image(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                              __private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+#endif
+                                              __global const DATA_TYPE *input, //Oc, Ic, H, W
                                              __private const int input_offset,
                                              __private const int in_channels,
                                              __private const int height,
                                              __private const int width,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                              __write_only image2d_t output,
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1) {
-#else
                                              __write_only image2d_t output) {
-#endif
-
  int w = get_global_id(0);
  int h = get_global_id(1);

@@ -584,18 +562,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
 }

 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
+__kernel void winograd_filter_image_to_buffer(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                              __private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+#endif
+                                              __global DATA_TYPE *output, //Oc, Ic, H, W
                                              __private const int height,
                                              __private const int width,
                                              __private const int channel,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                              __read_only image2d_t input,
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1) {
-#else
                                              __read_only image2d_t input) {
-#endif
-
  const int w = get_global_id(0);
  const int h = get_global_id(1);


--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
 #include <common.h>

 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(__read_only image2d_t input,
-                      __private const int groups,
-                      __private const int channels_per_group,
+__kernel void channel_shuffle(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
-                      __write_only image2d_t output) {
+                              __private const int global_size_dim0,
+                              __private const int global_size_dim1,
+                              __private const int global_size_dim2,
 #endif
-
+                              __read_only image2d_t input,
+                              __private const int groups,
+                              __private const int channels_per_group,
+                              __write_only image2d_t output) {
  const int group_chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);

--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,18 +22,16 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }

 // Supported data type: half/float
-__kernel void concat_channel(__read_only image2d_t input0,
-                             __read_only image2d_t input1,
-                             __private const int input0_chan,
+__kernel void concat_channel(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                             __write_only image2d_t output,
                             __private const int global_size_dim0,
                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
-                             __write_only image2d_t output) {
+                             __private const int global_size_dim2,
 #endif
-
+                             __read_only image2d_t input0,
+                             __read_only image2d_t input1,
+                             __private const int input0_chan,
+                             __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
@@ -90,17 +88,15 @@ __kernel void concat_channel(__read_only image2d_t input0,
 }

 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(__read_only image2d_t input,
-                                   __private const int chan_blk_offset,
+__kernel void concat_channel_multi(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                                   __write_only image2d_t output,
                                   __private const int global_size_dim0,
                                   __private const int global_size_dim1,
-                                   __private const int global_size_dim2) {
-#else
-                                   __write_only image2d_t output) {
+                                   __private const int global_size_dim2,
 #endif
-
+                                   __read_only image2d_t input,
+                                   __private const int chan_blk_offset,
+                                   __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
 #include <common.h>

-__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+                      __private const int global_size_dim2,
+#endif
+                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -18,15 +24,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __private const int padding_top,
                      __private const int padding_left,
                      __private const int dilation_h,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __private const int dilation_w,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
                      __private const int dilation_w) {
-#endif
-
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
  const int out_hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
 #include <common.h>

-__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_1x1(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                          __private const int global_size_dim0,
+                          __private const int global_size_dim1,
+                          __private const int global_size_dim2,
+#endif
+                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -12,15 +18,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                          __private const int in_ch_blks,
                          __private const int height,
                          __private const int width,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                          __private const int stride,
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2) {
-#else
                          __private const int stride) {
-#endif
-
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
  const int out_hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
 #include <common.h>

-__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_3x3(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                          __private const int global_size_dim0,
+                          __private const int global_size_dim1,
+                          __private const int global_size_dim2,
+#endif
+                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -16,15 +22,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                          __private const int padding_top,
                          __private const int padding_left,
                          __private const int dilation_h,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                          __private const int dilation_w,
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2) {
-#else
                          __private const int dilation_w) {
-#endif
-
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
  const int out_hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
 #include <common.h>

-__kernel void depth_to_space(__read_only image2d_t input,
-                             __private const int block_size,
-                             __private const int output_depth,
+__kernel void depth_to_space(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                             __write_only image2d_t output,
                             __private const int global_size_dim0,
                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
-                             __write_only image2d_t output) {
+                             __private const int global_size_dim2,
 #endif
-
+                             __read_only image2d_t input,
+                             __private const int block_size,
+                             __private const int output_depth,
+                             __write_only image2d_t output) {
  const int out_d = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_h = get_global_id(2);
@@ -44,17 +42,16 @@ __kernel void depth_to_space(__read_only image2d_t input,
  WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
 }

-__kernel void space_to_depth(__read_only image2d_t input,
-                             __private const int block_size,
-                             __private const int input_depth,
+__kernel void space_to_depth(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                             __write_only image2d_t output,
                             __private const int global_size_dim0,
                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
-                             __write_only image2d_t output) {
+                             __private const int global_size_dim2,
 #endif
+                             __read_only image2d_t input,
+                             __private const int block_size,
+                             __private const int input_depth,
+                             __write_only image2d_t output) {

  const int d = get_global_id(0);
  const int w = get_global_id(1);

--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
 #include <common.h>

 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                               __private const int global_size_dim0,
+                               __private const int global_size_dim1,
+                               __private const int global_size_dim2,
+#endif
+                               __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -18,15 +24,7 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
                               __private const short padding_top,
                               __private const short padding_left,
                               __private const short dilation_h,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                               __private const short dilation_w,
-                               __private const int global_size_dim0,
-                               __private const int global_size_dim1,
-                               __private const int global_size_dim2) {
-#else
                               __private const short dilation_w) {
-#endif
-
  const short out_ch_blk = get_global_id(0);
  const short out_w_blk = get_global_id(1);
  const short out_hb = get_global_id(2);
@@ -144,7 +142,13 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }

-__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d_s1(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                  __private const int global_size_dim0,
+                                  __private const int global_size_dim1,
+                                  __private const int global_size_dim2,
+#endif
+                                  __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                  __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -159,15 +163,7 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
                                  __private const short filter_height,
                                  __private const short filter_width,
                                  __private const short padding_top,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __private const short padding_left,
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-                                  __private const int global_size_dim2) {
-#else
                                  __private const short padding_left) {
-#endif
-
  const short out_ch_blk = get_global_id(0);
  const short out_w_blk = get_global_id(1) << 2;
  const short out_hb = get_global_id(2);

--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
 #include <common.h>

-__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void eltwise(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+#endif
+                      __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t input1,
 #ifdef COEFF_SUM
                      __private const float coeff0,
                      __private const float coeff1,
 #endif
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1) {
-#else
                      __write_only image2d_t output) {
-#endif
-
  const int w = get_global_id(0);
  const int hb = get_global_id(1);


--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
 #include <common.h>

 // C = A * B
-__kernel void matmul(__read_only image2d_t A,
+__kernel void matmul(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                     __private const int global_size_dim0,
+                     __private const int global_size_dim1,
+#endif
+                     __read_only image2d_t A,
                     __read_only image2d_t B,
                     __write_only image2d_t C,
                     __private const int M,
                     __private const int N,
                     __private const int K,
                     __private const int height_blocks,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                     __private const int k_blocks,
-                     __private const int global_size_dim0,
-                     __private const int global_size_dim1) {
-#else
                     __private const int k_blocks) {
-#endif
-
  const int gx = get_global_id(0) << 2;
  const int hb = get_global_id(1);


--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,7 +19,13 @@ inline int calculate_avg_block_size(const int pool_size,
 }

 // Supported data type: half/float
-__kernel void pooling(__read_only image2d_t input,
+__kernel void pooling(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                      __private const int global_size_dim0,
+                      __private const int global_size_dim1,
+                      __private const int global_size_dim2,
+#endif
+                      __read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int out_height,
@@ -27,14 +33,7 @@ __kernel void pooling(__read_only image2d_t input,
                      __private const int pad_left,
                      __private const int stride,
                      __private const int pooling_size,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
                      __write_only image2d_t output) {
-#endif

  const int out_chan_idx = get_global_id(0);
  const int out_width_idx = get_global_id(1);

--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
 #include <common.h>

-__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void resize_bilinear_nocache(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                      __private const int global_size_dim0,
+                                      __private const int global_size_dim1,
+                                      __private const int global_size_dim2,
+#endif
+                                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
                                      __private const int in_width,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                      __private const int out_height,
-                                      __private const int global_size_dim0,
-                                      __private const int global_size_dim1,
-                                      __private const int global_size_dim2) {
-#else
                                      __private const int out_height) {
-#endif

  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);

--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
 #include <common.h>

-__kernel void slice(__read_only image2d_t input,
-                    __private const int chan_blk_offset,
+__kernel void slice(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                    __write_only image2d_t output,
                    __private const int global_size_dim0,
                    __private const int global_size_dim1,
-                    __private const int global_size_dim2) {
-#else
-                    __write_only image2d_t output) {
+                    __private const int global_size_dim2,
 #endif
-
+                    __read_only image2d_t input,
+                    __private const int chan_blk_offset,
+                    __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);

--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
 #include <common.h>

-__kernel void softmax(__read_only image2d_t input,
-                      __private const int channels,
-                      __private const int remain_channels,
+__kernel void softmax(
 #ifndef USE_QUALCOMM_OPENCL_2_0
-                      __write_only image2d_t output,
                      __private const int global_size_dim0,
                      __private const int global_size_dim1,
-                      __private const int global_size_dim2) {
-#else
-                      __write_only image2d_t output) {
+                      __private const int global_size_dim2,
 #endif
-
+                      __read_only image2d_t input,
+                      __private const int channels,
+                      __private const int remain_channels,
+                      __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);

--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
 #include <common.h>

-__kernel void space_to_batch(__read_only image2d_t space_data,
+__kernel void space_to_batch(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                             __private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2,
+#endif
+                             __read_only image2d_t space_data,
                             __write_only image2d_t batch_data,
                             __private const int block_height,
                             __private const int block_width,
@@ -9,15 +15,7 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
                             __private const int space_height,
                             __private const int space_width,
                             __private const int batch_height,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int batch_width,
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
                             __private const int batch_width) {
-#endif
-
  const int chan_idx = get_global_id(0);
  const int batch_w_idx = get_global_id(1);
  const int batch_hb_idx = get_global_id(2);
@@ -54,7 +52,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
  WRITE_IMAGET(batch_data, batch_coord, value);
 }

-__kernel void batch_to_space(__read_only image2d_t batch_data,
+__kernel void batch_to_space(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                             __private const int global_size_dim0,
+                             __private const int global_size_dim1,
+                             __private const int global_size_dim2,
+#endif
+                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,
                             __private const int block_height,
                             __private const int block_width,
@@ -63,15 +67,7 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
                             __private const int space_height,
                             __private const int space_width,
                             __private const int batch_height,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int batch_width,
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2) {
-#else
                             __private const int batch_width) {
-#endif
-
  const int chan_idx = get_global_id(0);
  const int batch_w_idx = get_global_id(1);
  const int batch_hb_idx = get_global_id(2);

--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
 #include <common.h>

-__kernel void winograd_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_transform_2x2(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     __private const int global_size_dim0,
+                                     __private const int global_size_dim1,
+#endif
+                                     __read_only image2d_t input,
                                     __write_only image2d_t output,
                                     __private const int in_height,
                                     __private const int in_width,
@@ -8,14 +13,7 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
                                     __private const int round_hw,
                                     __private const int round_w,
                                     __private const int padding_top,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int padding_left,
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1) {
-#else
                                     __private const int padding_left) {
-#endif
-
  int out_width_idx = get_global_id(0);
  int chan_blk_idx = get_global_id(1);

@@ -121,7 +119,12 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
  }
 }

-__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_inverse_transform_2x2(
+#ifndef USE_QUALCOMM_OPENCL_2_0
+                                             __private const int global_size_dim0,
+                                             __private const int global_size_dim1,
+#endif
+                                             __read_only image2d_t input,
 #ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -130,14 +133,7 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                             __private const int out_width,
                                             __private const int round_hw,
                                             __private const int round_w,
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                             __private const float relux_max_limit,
-                                             __private const int global_size_dim0,
-                                             __private const int global_size_dim1) {
-#else
                                             __private const float relux_max_limit) {
-#endif
-
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);


--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -56,6 +56,11 @@ static void Concat2(cl::Kernel *kernel,
  }
  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
    uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++,
                   *(static_cast<const cl::Image2D *>(input0->opencl_image())));
    kernel->setArg(idx++,
@@ -63,9 +68,6 @@ static void Concat2(cl::Kernel *kernel,
    kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
    kernel->setArg(idx++,
                   *(static_cast<cl::Image2D *>(output->opencl_image())));
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);

    *prev_input_shape = input0->shape();

@@ -119,12 +121,14 @@ static void ConcatN(cl::Kernel *kernel,
    };

    uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
    kernel->setArg(idx++, *(output->opencl_image()));
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);

    chan_blk_offset += input_channel_blk;
    *kwg_size =

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -84,6 +84,11 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,

  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -98,9 +103,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    kernel->setArg(idx++, static_cast<int>(height));
    kernel->setArg(idx++, static_cast<int>(width));
    kernel->setArg(idx++, stride);
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);

    *prev_input_shape = input->shape();


--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -79,6 +79,11 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,

  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -96,9 +101,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    kernel->setArg(idx++, padding[1] / 2);
    kernel->setArg(idx++, dilations[0]);
    kernel->setArg(idx++, dilations[1]);
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);

    *prev_input_shape = input->shape();


--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -79,6 +79,11 @@ extern void Conv2dOpencl(cl::Kernel *kernel,

  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -98,9 +103,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    kernel->setArg(idx++, padding[1] / 2);
    kernel->setArg(idx++, dilations[0]);
    kernel->setArg(idx++, dilations[1]);
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);

    *prev_input_shape = input->shape();


--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -68,12 +68,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
  uint32_t gws[3];
  std::stringstream ss;
  if (!IsVecEqual(input_shape_, input->shape())) {
-    uint32_t idx = 0;
-    kernel_.setArg(idx++, *(input->opencl_image()));
-    kernel_.setArg(idx++, block_size_);
-    kernel_.setArg(idx++, depth_blocks);
-    kernel_.setArg(idx++, *(output->opencl_image()));
-
    if (d2s_) {
      gws[0] = static_cast<uint32_t>(depth_blocks);
      gws[1] = static_cast<uint32_t>(output_width);
@@ -88,9 +82,16 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
         << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
    }

-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);
+    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
+    kernel_.setArg(idx++, *(input->opencl_image()));
+    kernel_.setArg(idx++, block_size_);
+    kernel_.setArg(idx++, depth_blocks);
+    kernel_.setArg(idx++, *(output->opencl_image()));

    input_shape_ = input->shape();


--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -97,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
               input_channels);

    uint32_t idx = 0;
+    if (!(*is_non_uniform_work_groups_supported)) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -117,9 +122,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
      kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
      kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
    }
-    kernel->setArg(idx++, gws[0]);
-    kernel->setArg(idx++, gws[1]);
-    kernel->setArg(idx++, gws[2]);

    *prev_input_shape = input->shape();


--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -47,6 +47,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
  }
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(idx++, *(input0->opencl_image()));
    kernel_.setArg(idx++, *(input1->opencl_image()));
    if (!coeff_.empty()) {
@@ -54,8 +58,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
      kernel_.setArg(idx++, coeff_[1]);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);

    input_shape_ = input0->shape();


--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -48,6 +48,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
  }
  uint32_t idx = 0;
+  if (!is_non_uniform_work_groups_supported_) {
+    kernel_.setArg(idx++, gws[0]);
+    kernel_.setArg(idx++, gws[1]);
+  }
  kernel_.setArg(idx++, *(A->opencl_image()));
  kernel_.setArg(idx++, *(B->opencl_image()));
  kernel_.setArg(idx++, *(C->opencl_image()));
@@ -56,8 +60,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  kernel_.setArg(idx++, static_cast<int>(A->dim(2)));
  kernel_.setArg(idx++, static_cast<int>(height_blocks));
  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
-  kernel_.setArg(idx++, gws[0]);
-  kernel_.setArg(idx++, gws[1]);

  kwg_size_ =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -81,6 +81,11 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    };

    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
@@ -90,14 +95,23 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, strides_[0]);
    kernel_.setArg(idx++, kernels_[0]);
    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    input_shape_ = input->shape();

    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
+  } else {
+    index_t batch = output->dim(0);
+    index_t out_height = output->dim(1);
+    index_t out_width = output->dim(2);
+    index_t channels = output->dim(3);
+
+    index_t channel_blocks = (channels + 3) / 4;
+
+    gws = {
+        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
+        static_cast<uint32_t>(batch * out_height),
+    };
  }

  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -60,6 +60,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
        CalculateResizeScale(in_width, out_width, align_corners_);

    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
@@ -67,9 +72,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    input_shape_ = input->shape();


--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -65,12 +65,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     << outputs_count;
  for (int i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
  }

--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -45,13 +45,15 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
  }
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(logits->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_image()));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    input_shape_ = logits->shape();


--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -57,6 +57,11 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
  }
  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    if (b2s_) {
      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
@@ -72,9 +77,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<int32_t>(space_tensor->dim(2)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);
-    kernel_.setArg(idx++, gws[2]);

    space_shape_ = space_tensor->shape();


--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -61,6 +61,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    output_tensor->ResizeImage(output_shape, image_shape);

    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(idx++, *(input_tensor->opencl_image()));
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
@@ -70,8 +74,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[0] / 2));
    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);

    input_shape_ = input_tensor->shape();

@@ -151,6 +153,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const uint32_t round_h = (height_ + 1) / 2;
    const uint32_t round_w = (width_ + 1) / 2;
    uint32_t idx = 0;
+    if (!is_non_uniform_work_groups_supported_) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(
        idx++,
        *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
@@ -165,8 +171,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
    kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, gws[0]);
-    kernel_.setArg(idx++, gws[1]);

    input_shape_ = input_tensor->shape();


--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -376,7 +376,8 @@ def main(unused_args):
        build_run_throughput_test(target_soc, FLAGS.run_seconds,
                                  merged_lib_file, FLAGS.output_dir)

-  packaging_lib_file(FLAGS.output_dir)
+  if FLAGS.mode == "build" or FLAGS.mode == "all":
+    packaging_lib_file(FLAGS.output_dir)


 if __name__ == "__main__":

--- a/tools/packaging_lib.sh
+++ b/tools/packaging_lib.sh
@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh

 LIBMACE_BUILD_DIR=$1

+TAR_PACKAGE_NAME=libmace_${PROJECT_NAME}.tar.gz
+
 pushd $LIBMACE_BUILD_DIR/$PROJECT_NAME
-ls | grep -v build | xargs tar cvzf libmace_${PROJECT_NAME}.tar.gz
+if [ -f $TAR_PACKAGE_NAME ]; then
+  rm -f $TAR_PACKAGE_NAME
+fi
+ls | grep -v build | xargs tar cvzf $TAR_PACKAGE_NAME
 popd

 echo "Packaging done!"