From e9eaa4d4a57ce46e2a7c41dd96950429e5af2b0e Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Fri, 30 Mar 2018 16:58:16 +0800
Subject: [PATCH] FC and CWise ops support opencl 1.1/1.2.

---
 mace/core/runtime/opencl/opencl_runtime.cc    | 12 +--
 mace/core/runtime/opencl/opencl_runtime.h     |  1 +
 mace/kernels/cwise.h                          |  1 +
 mace/kernels/opencl/cl/activation.cl          |  3 +-
 mace/kernels/opencl/cl/addn.cl                |  3 +-
 mace/kernels/opencl/cl/batch_norm.cl          |  3 +-
 mace/kernels/opencl/cl/bias_add.cl            |  3 +-
 mace/kernels/opencl/cl/buffer_to_image.cl     | 36 +++-----
 mace/kernels/opencl/cl/channel_shuffle.cl     |  3 +-
 mace/kernels/opencl/cl/common.h               |  8 +-
 mace/kernels/opencl/cl/concat.cl              |  6 +-
 mace/kernels/opencl/cl/conv_2d.cl             |  3 +-
 mace/kernels/opencl/cl/conv_2d_1x1.cl         |  3 +-
 mace/kernels/opencl/cl/conv_2d_3x3.cl         |  3 +-
 mace/kernels/opencl/cl/cwise.cl               | 11 ++-
 mace/kernels/opencl/cl/depth_to_space.cl      |  5 +-
 mace/kernels/opencl/cl/depthwise_conv2d.cl    |  6 +-
 mace/kernels/opencl/cl/eltwise.cl             |  3 +-
 mace/kernels/opencl/cl/fully_connected.cl     | 23 ++++-
 mace/kernels/opencl/cl/matmul.cl              |  3 +-
 mace/kernels/opencl/cl/pooling.cl             |  3 +-
 mace/kernels/opencl/cl/resize_bilinear.cl     |  3 +-
 mace/kernels/opencl/cl/slice.cl               |  3 +-
 mace/kernels/opencl/cl/softmax.cl             |  3 +-
 mace/kernels/opencl/cl/space_to_batch.cl      |  6 +-
 mace/kernels/opencl/cl/winograd_transform.cl  |  6 +-
 mace/kernels/opencl/cwise_opencl.cc           | 18 +++-
 mace/kernels/opencl/fully_connected_opencl.cc | 91 ++++++++++++++-----
 mace/ops/depthwise_conv2d_test.cc             |  1 -
 mace/ops/fully_connected_test.cc              |  4 +-
 30 files changed, 159 insertions(+), 118 deletions(-)

diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 01f370c5..0728d5f0 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -480,12 +480,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
 }
 
 const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
-  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
-      opencl_version_ == "2.0") {
-    return true;
-  } else {
-    return false;
-  }
+  return (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0");
+}
+
+const GPUType OpenCLRuntime::gpu_type() const {
+  return gpu_type_;
 }
 
 const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 3814eb41..3f5261b8 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -66,6 +66,7 @@ class OpenCLRuntime {
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
   const bool IsNonUniformWorkgroupsSupported();
   const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
+  const GPUType gpu_type() const;
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
diff --git a/mace/kernels/cwise.h b/mace/kernels/cwise.h
index 073f5c48..07e03e7f 100644
--- a/mace/kernels/cwise.h
+++ b/mace/kernels/cwise.h
@@ -114,6 +114,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
                   StatsFuture *future);
 
   cl::Kernel kernel_;
+  uint32_t kwg_size_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index 42afc701..2978f402 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void activation(
-                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input,
 #ifdef USE_PRELU
                          __read_only image2d_t alpha,
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index d0604f9e..30f52247 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void addn(
-                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2
                    __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                    __read_only image2d_t input1,
 #if INPUT_NUM > 2
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 0075932d..290b6c1a 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(
-                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3
                          __read_only image2d_t input,
                          __read_only image2d_t scale,
                          __read_only image2d_t offset,
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index a2d99abc..64de2d77 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(
-                       UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
                        __read_only image2d_t input,
                        __read_only image2d_t bias,
                        __write_only image2d_t output) {
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index 86071708..e300bc51 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void filter_buffer_to_image(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
                                      __private const int filter_h,
@@ -53,8 +52,7 @@ __kernel void filter_buffer_to_image(
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void filter_image_to_buffer(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_h,
                                      __private const int filter_w,
@@ -102,8 +100,7 @@ __kernel void filter_image_to_buffer(
   }
 }
 
-__kernel void dw_filter_buffer_to_image(
-                                        UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                         __global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int input_offset,
                                         __private const int filter_w,
@@ -160,8 +157,7 @@ __kernel void dw_filter_buffer_to_image(
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_buffer_to_image(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global const DATA_TYPE *input, /* nhwc */
                                      __private const int input_offset,
                                      __private const int height,
@@ -202,8 +198,7 @@ __kernel void in_out_buffer_to_image(
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_image_to_buffer(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
@@ -242,8 +237,7 @@ __kernel void in_out_image_to_buffer(
   }
 }
 
-__kernel void arg_buffer_to_image(
-                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                   __global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
@@ -278,8 +272,7 @@ __kernel void arg_buffer_to_image(
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void arg_image_to_buffer(
-                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                   __global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
                                   __read_only image2d_t input) {
@@ -312,8 +305,7 @@ __kernel void arg_image_to_buffer(
 }
 
 
-__kernel void in_out_height_buffer_to_image(
-                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __global const DATA_TYPE *input, //nhwc
                                             __private const int input_offset,
                                             __private const int height,
@@ -355,8 +347,7 @@ __kernel void in_out_height_buffer_to_image(
   WRITE_IMAGET(output, coord, values);
 }
 
-__kernel void in_out_height_image_to_buffer(
-                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                             __global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
@@ -394,8 +385,7 @@ __kernel void in_out_height_image_to_buffer(
 }
 
 
-__kernel void in_out_width_buffer_to_image(
-                                           UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                            __global const DATA_TYPE *input, /* nhwc */
                                            __private const int input_offset,
                                            __private const int height,
@@ -437,8 +427,7 @@ __kernel void in_out_width_buffer_to_image(
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(
-                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
                                               __global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __private const int input_offset,
                                               __private const int in_channels,
@@ -529,8 +518,7 @@ __kernel void winograd_filter_buffer_to_image(
 }
 
 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(
-                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
                                               __global DATA_TYPE *output, //Oc, Ic, H, W
                                               __private const int height,
                                               __private const int width,
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 3fa2894e..92ff9447 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -1,8 +1,7 @@
 #include <common.h>
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(
-                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
                               __private const int groups,
                               __private const int channels_per_group,
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index 6e698b5c..b68bca07 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -19,18 +19,18 @@
 
 #ifndef NON_UNIFORM_WORK_GROUP
 
-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+#define GLOBAL_WORK_GROUP_SIZE_DIM2 \
     __private const int global_size_dim0,       \
     __private const int global_size_dim1,
-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+#define GLOBAL_WORK_GROUP_SIZE_DIM3 \
     __private const int global_size_dim0,       \
     __private const int global_size_dim1,       \
     __private const int global_size_dim2,
 
 #else
 
-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+#define GLOBAL_WORK_GROUP_SIZE_DIM2
+#define GLOBAL_WORK_GROUP_SIZE_DIM3
 
 #endif
 
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index 3b7370a8..0e171e0f 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,8 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }
 
 // Supported data type: half/float
-__kernel void concat_channel(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
@@ -84,8 +83,7 @@ __kernel void concat_channel(
 }
 
 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(
-                                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
                                    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
                                    __write_only image2d_t output) {
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index 1383557d..f88885b0 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void conv_2d(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index e993a159..a5454a67 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void conv_2d_1x1(
-                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 8bc27b33..19a636bb 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void conv_2d_3x3(
-                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/cwise.cl b/mace/kernels/opencl/cl/cwise.cl
index 16f1f085..92cdaf7e 100644
--- a/mace/kernels/opencl/cl/cwise.cl
+++ b/mace/kernels/opencl/cl/cwise.cl
@@ -1,11 +1,16 @@
 #include <common.h>
 
-__kernel void cwise(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                      __private const float value,
-                      __write_only image2d_t output) {
+__kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2
+                    __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+                    __private const float value,
+                    __write_only image2d_t output) {
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
   DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 in1 = (DATA_TYPE4){value, value, value, value};
   DATA_TYPE4 out;
diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl
index 21045ec9..8d989290 100644
--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void depth_to_space(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void depth_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
                              __private const int input_height,
@@ -36,7 +35,7 @@ __kernel void depth_to_space(
 }
 
 __kernel void space_to_depth(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t input,
                              __private const int block_size,
                              __private const int input_height,
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index fff19613..c71ec404 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -1,8 +1,7 @@
 #include <common.h>
 
 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(
-                               UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
                                __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -138,8 +137,7 @@ __kernel void depthwise_conv2d(
   WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }
 
-__kernel void depthwise_conv2d_s1(
-                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
                                   __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index 8509dc38..0b9647f5 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void eltwise(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2
                       __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t input1,
 #ifdef COEFF_SUM
diff --git a/mace/kernels/opencl/cl/fully_connected.cl b/mace/kernels/opencl/cl/fully_connected.cl
index 057a66a4..3205e492 100644
--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
@@ -1,7 +1,8 @@
 #include <common.h>
 
 // output = weight * input + bias
-__kernel void fully_connected(__read_only image2d_t input,
+__kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
+                              __read_only image2d_t input,
                               __read_only image2d_t weight,
 #ifdef BIAS
     __read_only image2d_t bias,
@@ -15,6 +16,10 @@ __kernel void fully_connected(__read_only image2d_t input,
   const int out_blk_idx = get_global_id(1);
   const int input_chan_blk = (input_channel + 3) >> 2;
 
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (batch_idx >= global_size_dim0 || out_blk_idx >= global_size_dim1) return;
+#endif
+
   float4 input_value;
   float4 w0, w1, w2, w3;
 
@@ -57,7 +62,8 @@ __kernel void fully_connected(__read_only image2d_t input,
 }
 
 // output = weight * input + bias
-__kernel void fully_connected_width(__read_only image2d_t input,
+__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
+                                    __read_only image2d_t input,
                                     __read_only image2d_t weight,
 #ifdef BIAS
     __read_only image2d_t bias,
@@ -73,6 +79,7 @@ __kernel void fully_connected_width(__read_only image2d_t input,
   const int width_blk_idx = get_global_id(1);
   const int width_blk_count = get_global_size(1);
   const int batch_out_blk_idx = get_global_id(2);
+
   const int batch_idx = batch_out_blk_idx / out_blks;
   const int out_blk_idx = batch_out_blk_idx % out_blks;
 
@@ -115,6 +122,16 @@ __kernel void fully_connected_width(__read_only image2d_t input,
   short inter_idx = mad24((short)get_local_id(2), local_size, inter_out_offset);
   intermediate_output[inter_idx] = sum;
 
+#ifdef NON_QUALCOMM_ADRENO
+  barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (batch_out_blk_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+
   if (inter_out_offset == 0) {
 #ifdef BIAS
     DATA_TYPE4 result = READ_IMAGET(bias, SAMPLER, (int2)(out_blk_idx, 0));
@@ -122,7 +139,7 @@ __kernel void fully_connected_width(__read_only image2d_t input,
     DATA_TYPE4 result = (DATA_TYPE4)(0, 0, 0, 0);
 #endif
 
-    for(short i = 0; i < local_width_blk_size; ++i) {
+    for (short i = 0; i < local_width_blk_size; ++i) {
       result += vload4(0, intermediate_output+inter_idx);
       inter_idx += 4;
     }
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index fe260e7a..82ccf6ba 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -1,8 +1,7 @@
 #include <common.h>
 
 // C = A * B
-__kernel void matmul(
-                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
                      __read_only image2d_t A,
                      __read_only image2d_t B,
                      __write_only image2d_t C,
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index ead83994..25785bb2 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,8 +19,7 @@ inline int calculate_avg_block_size(const int pool_size,
 }
 
 // Supported data type: half/float
-__kernel void pooling(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int in_height,
                       __private const int in_width,
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index 83e6df85..2b0464c7 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void resize_bilinear_nocache(
-                                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
                                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
                                       __private const float height_scale,
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index eccdd882..0692c62b 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void slice(
-                    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
                     __read_only image2d_t input,
                     __private const int chan_blk_offset,
                     __write_only image2d_t output) {
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index 628d71cb..b5b99de6 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void softmax(
-                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3
                       __read_only image2d_t input,
                       __private const int channels,
                       __private const int remain_channels,
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 1e202404..431a5997 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void space_to_batch(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
                              __private const int block_height,
@@ -48,8 +47,7 @@ __kernel void space_to_batch(
   WRITE_IMAGET(batch_data, batch_coord, value);
 }
 
-__kernel void batch_to_space(
-                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
                              __read_only image2d_t batch_data,
                              __write_only image2d_t space_data,
                              __private const int block_height,
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index d447001e..0cab37d7 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -1,7 +1,6 @@
 #include <common.h>
 
-__kernel void winograd_transform_2x2(
-                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
                                      __private const int in_height,
@@ -116,8 +115,7 @@ __kernel void winograd_transform_2x2(
   }
 }
 
-__kernel void winograd_inverse_transform_2x2(
-                                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
                                              __read_only image2d_t input,
 #ifdef BIAS
                                              __read_only image2d_t bias, /* cout%4 * cout/4 */
diff --git a/mace/kernels/opencl/cwise_opencl.cc b/mace/kernels/opencl/cwise_opencl.cc
index bd839c55..dce3d14d 100644
--- a/mace/kernels/opencl/cwise_opencl.cc
+++ b/mace/kernels/opencl/cwise_opencl.cc
@@ -23,8 +23,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const index_t width_pixels = channel_blocks * width;
   const index_t batch_height_pixels = batch * height;
 
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
   if (kernel_.get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("cwise");
@@ -32,19 +34,27 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DCWISE_TYPE=", type_));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     kernel_ = runtime->BuildKernel("cwise", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
     kernel_.setArg(idx++, *(input->opencl_image()));
     kernel_.setArg(idx++, static_cast<float>(coeff_));
     kernel_.setArg(idx++, *(output->opencl_image()));
     input_shape_ = input->shape();
   }
 
-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  const std::vector<uint32_t> lws = {64, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
   std::stringstream ss;
   ss << "cwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected_opencl.cc
index 70af952e..3178b8ae 100644
--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -27,6 +27,10 @@ void FCWXKernel(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
+    const index_t batch = output->dim(0);
+    const index_t output_size = output->dim(3);
+    const index_t output_blocks = RoundUpDiv4(output_size);
+
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
@@ -55,28 +59,47 @@ void FCWXKernel(cl::Kernel *kernel,
       default:
         LOG(FATAL) << "Unknown activation type: " << activation;
     }
+    if (runtime->gpu_type() != GPUType::QUALCOMM_ADRENO) {
+      built_options.emplace("-DNON_QUALCOMM_ADRENO");
+    }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
 
     *kernel =
         runtime->BuildKernel("fully_connected", kernel_name, built_options);
 
-    const index_t batch = output->dim(0);
-    const index_t output_size = output->dim(3);
-    const index_t output_blocks = RoundUpDiv4(output_size);
-    const uint32_t wave_size =
-        static_cast<uint32_t>(runtime->GetKernelWaveSize(*kernel));
+    if (runtime->gpu_type() == GPUType::QUALCOMM_ADRENO) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+      const uint32_t wave_size =
+          static_cast<uint32_t>(runtime->GetKernelWaveSize(*kernel));
 
-    *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
+      *gws = {4, (wave_size / 4), static_cast<uint32_t>(batch * output_blocks)};
 
-    const uint32_t kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
-    *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
+      const uint32_t kwg_size =
+          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
+      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
+    } else {
+      *gws = {4, 8, static_cast<uint32_t>(batch * output_blocks)};
+
+      const uint32_t kwg_size =
+          static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+      const uint32_t inter_local_blks = kwg_size / ((*gws)[0] * (*gws)[1]);
+      *lws = {(*gws)[0], (*gws)[1], inter_local_blks};
+    }
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     const index_t batch = output->dim(0);
     const index_t output_blocks = RoundUpDiv4(output->dim(3));
+    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
 
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, (*gws)[0]);
+      kernel->setArg(idx++, (*gws)[1]);
+      kernel->setArg(idx++, (*gws)[2]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(weight->opencl_image()));
     if (bias != nullptr) {
@@ -91,15 +114,25 @@ void FCWXKernel(cl::Kernel *kernel,
     kernel->setArg(idx++, static_cast<int>(output_blocks));
     kernel->setArg(idx++, relux_max_limit);
 
-    (*gws)[2] = static_cast<uint32_t>(batch * output_blocks);
-
     *prev_input_shape = input->shape();
   }
   cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
-      cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
-  MACE_CHECK_CL_SUCCESS(error);
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange, cl::NDRange((*gws)[0], (*gws)[1], (*gws)[2]),
+        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws->size());
+    for (size_t i = 0; i < lws->size(); ++i) {
+      roundup_gws[i] = RoundUp((*gws)[i], (*lws)[i]);
+    }
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        *kernel, cl::NullRange,
+        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange((*lws)[0], (*lws)[1], (*lws)[2]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
 
   if (future != nullptr) {
     future->wait_fn = [runtime, event](CallStats *stats) {
@@ -125,8 +158,8 @@ void FCWTXKernel(cl::Kernel *kernel,
                  StatsFuture *future) {
   MACE_CHECK_NOTNULL(gws);
   MACE_CHECK_NOTNULL(lws);
+  auto runtime = OpenCLRuntime::Global();
   if (kernel->get() == nullptr) {
-    auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("fully_connected");
@@ -136,6 +169,9 @@ void FCWTXKernel(cl::Kernel *kernel,
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
     }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
     switch (activation) {
       case NOOP:
         break;
@@ -157,10 +193,23 @@ void FCWTXKernel(cl::Kernel *kernel,
     *kernel =
         runtime->BuildKernel("fully_connected", kernel_name, built_options);
 
-    *lws = {16, 64, 1};
+    uint32_t kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
+    *lws = {16, kwg_size/16, 1};
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
+    const index_t batch = output->dim(0);
+    const index_t output_blocks = RoundUpDiv4(output->dim(3));
+
+    *gws = {
+        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
+    };
+
     uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, (*gws)[0]);
+      kernel->setArg(idx++, (*gws)[1]);
+    }
     kernel->setArg(idx++, *(input->opencl_image()));
     kernel->setArg(idx++, *(weight->opencl_image()));
     if (bias != nullptr) {
@@ -173,12 +222,6 @@ void FCWTXKernel(cl::Kernel *kernel,
     // FIXME handle flexable data type: half not supported
     kernel->setArg(idx++, relux_max_limit);
 
-    const index_t batch = output->dim(0);
-    const index_t output_blocks = RoundUpDiv4(output->dim(3));
-
-    *gws = {
-        static_cast<uint32_t>(batch), static_cast<uint32_t>(output_blocks),
-    };
     *prev_input_shape = input->shape();
   }
 
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index d401da97..c3bca21f 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -57,7 +57,6 @@ void SimpleValidTest() {
       .AddIntsArg("strides", {1, 1})
       .AddIntArg("padding", Padding::VALID)
       .AddIntsArg("dilations", {1, 1})
-      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
       .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc
index 26a893b2..f839f95c 100644
--- a/mace/ops/fully_connected_test.cc
+++ b/mace/ops/fully_connected_test.cc
@@ -225,7 +225,7 @@ void TestWXFormat(const index_t batch,
                                        kernels::BufferType::IN_OUT_CHANNEL);
   BufferToImage<DeviceType::OPENCL, T>(&net, "Weight", "WeightImage",
                                        kernels::BufferType::WEIGHT_WIDTH);
-  BufferToImage<DeviceType::OPENCL, float>(&net, "Bias", "BiasImage",
+  BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
                                            kernels::BufferType::ARGUMENT);
 
   OpDefBuilder("FC", "FullyConnectedTest")
@@ -236,7 +236,7 @@ void TestWXFormat(const index_t batch,
       .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
       .Finalize(net.NewOperatorDef());
 
-  // Run on opencl
+  // Run
   net.RunOp(DeviceType::OPENCL);
 
   ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "OPENCLOutput",
-- 
GitLab