FC and CWise ops support opencl 1.1/1.2.

e9eaa4d4 · liuqi · 6fcf7b6e · e9eaa4d4 · e9eaa4d4 · e9eaa4d4
30 changed file
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -480,12 +480,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
 }

 const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
-  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
-      opencl_version_ == "2.0") {
-    return true;
-  } else {
-    return false;
-  }
+  return (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0");
+}
+
+const GPUType OpenCLRuntime::gpu_type() const {
+  return gpu_type_;
 }

 const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(

--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -66,6 +66,7 @@ class OpenCLRuntime {
  uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
  const bool IsNonUniformWorkgroupsSupported();
  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
+  const GPUType gpu_type() const;
  cl::Kernel BuildKernel(const std::string &program_name,
                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);

--- a/mace/kernels/cwise.h
+++ b/mace/kernels/cwise.h
@@ -114,6 +114,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
 #include <common.h>

-__kernel void activation(
-                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
 #ifdef USE_PRELU
                         __read_only image2d_t alpha,

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
 #include <common.h>

-__kernel void addn(
-                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2
                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,
 #if INPUT_NUM > 2

--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(
-                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3
                         __read_only image2d_t input,
                         __read_only image2d_t scale,
                         __read_only image2d_t offset,

--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(
-                       UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __read_only image2d_t bias,
                       __write_only image2d_t output) {

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
 #include <common.h>

-__kernel void filter_buffer_to_image(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* h, w, oc, ic */
                                     __private const int input_offset,
                                     __private const int filter_h,
@@ -53,8 +52,7 @@ __kernel void filter_buffer_to_image(
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void filter_image_to_buffer(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* h, w, oc, ic */
                                     __private const int filter_h,
                                     __private const int filter_w,
@@ -102,8 +100,7 @@ __kernel void filter_image_to_buffer(
  }
 }

-__kernel void dw_filter_buffer_to_image(
-                                        UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                        __global const DATA_TYPE *input, /* h, w, ic, m */
                                        __private const int input_offset,
                                        __private const int filter_w,
@@ -160,8 +157,7 @@ __kernel void dw_filter_buffer_to_image(
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_buffer_to_image(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
                                     __private const int height,
@@ -202,8 +198,7 @@ __kernel void in_out_buffer_to_image(
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_image_to_buffer(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
                                     __private const int width,
@@ -242,8 +237,7 @@ __kernel void in_out_image_to_buffer(
  }
 }

-__kernel void arg_buffer_to_image(
-                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global const DATA_TYPE *input, /* nhwc */
                                  __private const int input_offset,
                                  __private const int count,
@@ -278,8 +272,7 @@ __kernel void arg_buffer_to_image(
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void arg_image_to_buffer(
-                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                  __global DATA_TYPE *output, /* nhwc */
                                  __private const int count,
                                  __read_only image2d_t input) {
@@ -312,8 +305,7 @@ __kernel void arg_image_to_buffer(
 }


-__kernel void in_out_height_buffer_to_image(
-                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
                                            __private const int height,
@@ -355,8 +347,7 @@ __kernel void in_out_height_buffer_to_image(
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_height_image_to_buffer(
-                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
                                            __private const int width,
@@ -394,8 +385,7 @@ __kernel void in_out_height_image_to_buffer(
 }


-__kernel void in_out_width_buffer_to_image(
-                                           UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
                                           __private const int height,
@@ -437,8 +427,7 @@ __kernel void in_out_width_buffer_to_image(
 }

 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(
-                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __global const DATA_TYPE *input, //Oc, Ic, H, W
                                              __private const int input_offset,
                                              __private const int in_channels,
@@ -529,8 +518,7 @@ __kernel void winograd_filter_buffer_to_image(
 }

 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(
-                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __global DATA_TYPE *output, //Oc, Ic, H, W
                                              __private const int height,
                                              __private const int width,

--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
 #include <common.h>

 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(
-                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int groups,
                              __private const int channels_per_group,

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -19,18 +19,18 @@

 #ifndef NON_UNIFORM_WORK_GROUP

-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+#define GLOBAL_WORK_GROUP_SIZE_DIM2 \
    __private const int global_size_dim0,       \
    __private const int global_size_dim1,
-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+#define GLOBAL_WORK_GROUP_SIZE_DIM3 \
    __private const int global_size_dim0,       \
    __private const int global_size_dim1,       \
    __private const int global_size_dim2,

 #else

-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+#define GLOBAL_WORK_GROUP_SIZE_DIM2
+#define GLOBAL_WORK_GROUP_SIZE_DIM3

 #endif


--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,8 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }

 // Supported data type: half/float
-__kernel void concat_channel(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t input0,
                             __read_only image2d_t input1,
                             __private const int input0_chan,
@@ -84,8 +83,7 @@ __kernel void concat_channel(
 }

 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(
-                                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input,
                                   __private const int chan_blk_offset,
                                   __write_only image2d_t output) {

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
 #include <common.h>

-__kernel void conv_2d(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS

--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
 #include <common.h>

-__kernel void conv_2d_1x1(
-                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
 #include <common.h>

-__kernel void conv_2d_3x3(
-                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS

--- a/mace/kernels/opencl/cl/cwise.cl
+++ b/mace/kernels/opencl/cl/cwise.cl
 #include <common.h>

-__kernel void cwise(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                      __private const float value,
-                      __write_only image2d_t output) {
+__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2
+                    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+                    __private const float value,
+                    __write_only image2d_t output) {
  const int w = get_global_id(0);
  const int hb = get_global_id(1);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
  DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(w, hb));
  DATA_TYPE4 in1 = (DATA_TYPE4){value, value, value, value};
  DATA_TYPE4 out;

--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
 #include <common.h>

-__kernel void depth_to_space(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t input,
                             __private const int block_size,
                             __private const int input_height,
@@ -36,7 +35,7 @@ __kernel void depth_to_space(
 }

 __kernel void space_to_depth(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t input,
                             __private const int block_size,
                             __private const int input_height,

--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
 #include <common.h>

 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(
-                               UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -138,8 +137,7 @@ __kernel void depthwise_conv2d(
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }

-__kernel void depthwise_conv2d_s1(
-                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
                                  __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                  __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS

--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
 #include <common.h>

-__kernel void eltwise(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2
                      __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t input1,
 #ifdef COEFF_SUM

--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
 #include <common.h>

 // output = weight * input + bias
-__kernel void fully_connected(__read_only image2d_t input,
+__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
+                              __read_only image2d_t input,
                              __read_only image2d_t weight,
 #ifdef BIAS
    __read_only image2d_t bias,
@@ -15,6 +16,10 @@ __kernel void fully_connected(__read_only image2d_t input,
  const int out_blk_idx = get_global_id(1);
  const int input_chan_blk = (input_channel + 3) >> 2;

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) return;
+#endif
+
  float4 input_value;
  float4 w0, w1, w2, w3;

@@ -57,7 +62,8 @@ __kernel void fully_connected(__read_only image2d_t input,
 }

 // output = weight * input + bias
-__kernel void fully_connected_width(__read_only image2d_t input,
+__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
+                                    __read_only image2d_t input,
                                    __read_only image2d_t weight,
 #ifdef BIAS
    __read_only image2d_t bias,
@@ -73,6 +79,7 @@ __kernel void fully_connected_width(__read_only image2d_t input,
  const int width_blk_idx = get_global_id(1);
  const int width_blk_count = get_global_size(1);
  const int batch_out_blk_idx = get_global_id(2);
+
  const int batch_idx = batch_out_blk_idx / out_blks;
  const int out_blk_idx = batch_out_blk_idx % out_blks;

@@ -115,6 +122,16 @@ __kernel void fully_connected_width(__read_only image2d_t input,
  short inter_idx = mad24((short)get_local_id(2), local_size, inter_out_offset);
  intermediate_output[inter_idx] = sum;

+#ifdef NON_QUALCOMM_ADRENO
+  barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (batch_out_blk_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+
  if (inter_out_offset == 0) {
 #ifdef BIAS
    DATA_TYPE4 result = READ_IMAGET(bias, SAMPLER, (int2)(out_blk_idx, 0));
@@ -122,7 +139,7 @@ __kernel void fully_connected_width(__read_only image2d_t input,
    DATA_TYPE4 result = (DATA_TYPE4)(0, 0, 0, 0);
 #endif

-    for(short i = 0; i < local_width_blk_size; ++i) {
+    for (short i = 0; i < local_width_blk_size; ++i) {
      result += vload4(0, intermediate_output+inter_idx);
      inter_idx += 4;
    }

--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
 #include <common.h>

 // C = A * B
-__kernel void matmul(
-                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
                     __read_only image2d_t A,
                     __read_only image2d_t B,
                     __write_only image2d_t C,

--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,8 +19,7 @@ inline int calculate_avg_block_size(const int pool_size,
 }

 // Supported data type: half/float
-__kernel void pooling(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,

--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
 #include <common.h>

-__kernel void resize_bilinear_nocache(
-                                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
                                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,

--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
 #include <common.h>

-__kernel void slice(
-                    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
                    __read_only image2d_t input,
                    __private const int chan_blk_offset,
                    __write_only image2d_t output) {

--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
 #include <common.h>

-__kernel void softmax(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3
                      __read_only image2d_t input,
                      __private const int channels,
                      __private const int remain_channels,

--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
 #include <common.h>

-__kernel void space_to_batch(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t space_data,
                             __write_only image2d_t batch_data,
                             __private const int block_height,
@@ -48,8 +47,7 @@ __kernel void space_to_batch(
  WRITE_IMAGET(batch_data, batch_coord, value);
 }

-__kernel void batch_to_space(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,
                             __private const int block_height,

--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
 #include <common.h>

-__kernel void winograd_transform_2x2(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
                                     __read_only image2d_t input,
                                     __write_only image2d_t output,
                                     __private const int in_height,
@@ -116,8 +115,7 @@ __kernel void winograd_transform_2x2(
  }
 }

-__kernel void winograd_inverse_transform_2x2(
-                                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __read_only image2d_t input,
 #ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */

--- a/mace/kernels/opencl/cwise_opencl.cc
+++ b/mace/kernels/opencl/cwise_opencl.cc
@@ -23,8 +23,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;

+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
  if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("cwise");
@@ -32,19 +34,27 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DCWISE_TYPE=", type_));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("cwise", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<float>(coeff_));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }

-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  const std::vector<uint32_t> lws = {64, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
  std::stringstream ss;
  ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -27,6 +27,10 @@ void FCWXKernel(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();

  if (kernel->get() == nullptr) {
+    const index_t batch = output->dim(0);
+    const index_t output_size = output->dim(3);
+    const index_t output_blocks = RoundUpDiv4(output_size);
+
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
@@ -55,28 +59,47 @@ void FCWXKernel(cl::Kernel *kernel,
      default:
        LOG(FATAL) << "Unknown activation type: " << activation;
    }
+    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
+      built_options.emplace("-DNON_QUALCOMM_ADRENO");
+    }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }

    *kernel =
        runtime->BuildKernel("fully_connected", kernel_name, built_options);

-    const index_t batch = output->dim(0);
-    const index_t output_size = output->dim(3);
-    const index_t output_blocks = RoundUpDiv4(output_size);
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(*kernel));
+    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+      const uint32_t wave_size =
+          static_cast<uint32_t>(runtime->GetKernelWaveSize(*kernel));

-    *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
+      *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};

-    const uint32_t kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
-    *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
+      const uint32_t kwg_size =
+          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
+      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
+    } else {
+      *gws = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
+
+      const uint32_t kwg_size =
+          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
+      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
+    }
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    const index_t batch = output->dim(0);
    const index_t output_blocks = RoundUpDiv4(output->dim(3));
+    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);

    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, (*gws)[0]);
+      kernel->setArg(idx++, (*gws)[1]);
+      kernel->setArg(idx++, (*gws)[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
@@ -91,15 +114,25 @@ void FCWXKernel(cl::Kernel *kernel,
    kernel->setArg(idx++, static_cast<int>(output_blocks));
    kernel->setArg(idx++, relux_max_limit);

-    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
-
    *prev_input_shape = input->shape();
  }
  cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
-      cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
-  MACE_CHECK_CL_SUCCESS(error);
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
+        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws->size());
+    for (size_t i = 0; i < lws->size(); ++i) {
+      roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]);
+    }
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange,
+        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;

  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
@@ -125,8 +158,8 @@ void FCWTXKernel(cl::Kernel *kernel,
                 StatsFuture *future) {
  MACE_CHECK_NOTNULL(gws);
  MACE_CHECK_NOTNULL(lws);
+  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
@@ -136,6 +169,9 @@ void FCWTXKernel(cl::Kernel *kernel,
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    switch (activation) {
      case NOOP:
        break;
@@ -157,10 +193,23 @@ void FCWTXKernel(cl::Kernel *kernel,
    *kernel =
        runtime->BuildKernel("fully_connected", kernel_name, built_options);

-    *lws = {16, 64, 1};
+    uint32_t kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+    *lws = {16, kwg_size/16, 1};
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
+    const index_t batch = output->dim(0);
+    const index_t output_blocks = RoundUpDiv4(output->dim(3));
+
+    *gws = {
+        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
+    };
+
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, (*gws)[0]);
+      kernel->setArg(idx++, (*gws)[1]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(weight->opencl_image()));
    if (bias != nullptr) {
@@ -173,12 +222,6 @@ void FCWTXKernel(cl::Kernel *kernel,
    // FIXME handle flexable data type: half not supported
    kernel->setArg(idx++, relux_max_limit);

-    const index_t batch = output->dim(0);
-    const index_t output_blocks = RoundUpDiv4(output->dim(3));
-
-    *gws = {
-        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
-    };
    *prev_input_shape = input->shape();
  }


--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -57,7 +57,6 @@ void SimpleValidTest() {
      .AddIntsArg("strides", {1, 1})
      .AddIntArg("padding", Padding::VALID)
      .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
    // Run
    net.RunOp(D);

--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -225,7 +225,7 @@ void TestWXFormat(const index_t batch,
                                       kernels::BufferType::IN_OUT_CHANNEL);
  BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage",
                                       kernels::BufferType::WEIGHT_WIDTH);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
+  BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
                                           kernels::BufferType::ARGUMENT);

  OpDefBuilder("FC", "FullyConnectedTest")
@@ -236,7 +236,7 @@ void TestWXFormat(const index_t batch,
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

-  // Run on opencl
+  // Run
  net.RunOp(DeviceType::OPENCL);

  ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",