update uniform work group size macro name, remove unnessary variable, use mace...

update uniform work group size macro name, remove unnessary variable, use mace for dim params in *.cl

update uniform work group size macro name, remove unnessary variable, use mace...
update uniform work group size macro name, remove unnessary variable, use mace for dim params in *.cl
0fd7ce42 · yejianwu · af6d9162 · 0fd7ce42 · 0fd7ce42 · 0fd7ce42
60 changed file
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -156,7 +156,6 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
  T relux_max_limit_;
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::string tuning_key_prefix_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -91,7 +91,6 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -158,7 +158,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                  StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -65,7 +65,6 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                  StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -57,7 +57,6 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  const int groups_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -86,7 +86,6 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                  StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -402,7 +402,6 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -109,7 +109,6 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  const int block_size_;
  bool d2s_;
  std::vector<index_t> input_shape_;

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -438,7 +438,6 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -98,7 +98,6 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -242,7 +242,6 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
 };
 }  // namespace kernels

--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -27,16 +27,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    switch (activation_) {
      case RELU:
@@ -63,6 +61,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -71,7 +72,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -85,9 +86,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -35,8 +35,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  }
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
@@ -47,11 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+    kwg_size_ =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -70,7 +71,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    output_tensor->ResizeImage(output_shape, output_image_shape);
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
    }
@@ -80,9 +81,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    input_shape_ = input_tensors[0]->shape();
-    kwg_size_ =
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -38,16 +38,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    if (folded_constant_) {
      built_options.emplace("-DFOLDED_CONSTANT");
@@ -72,10 +70,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    }
    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -92,9 +93,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, relux_max_limit_);
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -30,22 +30,23 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -54,16 +55,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
  cl::Event event;
  cl_int error;
-  if (is_non_uniform_work_groups_supported_) {
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -62,16 +62,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_non_uniform_work_groups_supported =
-      runtime->IsNonUniformWorkgroupsSupported();
  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
  std::set<std::string> built_options;
  std::stringstream kernel_name_ss;
  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
  built_options.emplace(kernel_name_ss.str());
-  if (is_non_uniform_work_groups_supported) {
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
-    built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
  }
  if (buffer->dtype() == image->dtype()) {
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
@@ -87,7 +84,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                                         obfuscated_kernel_name, built_options);
  uint32_t idx = 0;
-  if (!is_non_uniform_work_groups_supported) {
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
    b2f_kernel.setArg(idx++, gws[0]);
    b2f_kernel.setArg(idx++, gws[1]);
  }
@@ -123,7 +120,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
  cl::Event event;
  cl_int error;
-  if (is_non_uniform_work_groups_supported) {
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);

--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -37,24 +37,25 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
                                   built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -65,9 +66,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
 #include <common.h>
 __kernel void activation(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2,
-#endif
                         __read_only image2d_t input,
 #ifdef USE_PRELU
                         __read_only image2d_t alpha,
@@ -16,7 +12,7 @@ __kernel void activation(
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
      || hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
 #include <common.h>
 __kernel void addn(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                   __private const int global_size_dim0,
-                   __private const int global_size_dim1,
-#endif
                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,
 #if INPUT_NUM > 2
@@ -17,7 +14,7 @@ __kernel void addn(
  const int w = get_global_id(0);
  const int hb = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
 #endif

--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
 #include <common.h>
 // Supported data types: half/float
 __kernel void batch_norm(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2,
-#endif
                         __read_only image2d_t input,
                         __read_only image2d_t scale,
                         __read_only image2d_t offset,
@@ -20,7 +16,7 @@ __kernel void batch_norm(
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
      || hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
 #include <common.h>
 // Supported data types: half/float
 __kernel void bias_add(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                       UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                       __private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-#endif
                       __read_only image2d_t input,
                       __read_only image2d_t bias,
                       __write_only image2d_t output) {
@@ -13,7 +9,7 @@ __kernel void bias_add(
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
      || hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
 #include <common.h>
 __kernel void filter_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
                                     __global const DATA_TYPE *input, /* h, w, oc, ic */
                                     __private const int input_offset,
                                     __private const int filter_h,
@@ -15,7 +12,7 @@ __kernel void filter_buffer_to_image(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -57,10 +54,7 @@ __kernel void filter_buffer_to_image(
 }
 __kernel void filter_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
                                     __global DATA_TYPE *output, /* h, w, oc, ic */
                                     __private const int filter_h,
                                     __private const int filter_w,
@@ -70,7 +64,7 @@ __kernel void filter_image_to_buffer(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -109,10 +103,7 @@ __kernel void filter_image_to_buffer(
 }
 __kernel void dw_filter_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                        UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                        __private const int global_size_dim0,
-                                        __private const int global_size_dim1,
-#endif
                                        __global const DATA_TYPE *input, /* h, w, ic, m */
                                        __private const int input_offset,
                                        __private const int filter_w,
@@ -122,7 +113,7 @@ __kernel void dw_filter_buffer_to_image(
  const int w = get_global_id(0);
  const int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -170,10 +161,7 @@ __kernel void dw_filter_buffer_to_image(
 }
 __kernel void in_out_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
                                     __private const int height,
@@ -183,7 +171,7 @@ __kernel void in_out_buffer_to_image(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -215,10 +203,7 @@ __kernel void in_out_buffer_to_image(
 }
 __kernel void in_out_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
                                     __private const int width,
@@ -227,7 +212,7 @@ __kernel void in_out_image_to_buffer(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -258,10 +243,7 @@ __kernel void in_out_image_to_buffer(
 }
 __kernel void arg_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-#endif
                                  __global const DATA_TYPE *input, /* nhwc */
                                  __private const int input_offset,
                                  __private const int count,
@@ -269,7 +251,7 @@ __kernel void arg_buffer_to_image(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -297,17 +279,14 @@ __kernel void arg_buffer_to_image(
 }
 __kernel void arg_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-#endif
                                  __global DATA_TYPE *output, /* nhwc */
                                  __private const int count,
                                  __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -334,10 +313,7 @@ __kernel void arg_image_to_buffer(
 __kernel void in_out_height_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1,
-#endif
                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
                                            __private const int height,
@@ -347,7 +323,7 @@ __kernel void in_out_height_buffer_to_image(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -380,10 +356,7 @@ __kernel void in_out_height_buffer_to_image(
 }
 __kernel void in_out_height_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1,
-#endif
                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
                                            __private const int width,
@@ -392,7 +365,7 @@ __kernel void in_out_height_image_to_buffer(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -422,10 +395,7 @@ __kernel void in_out_height_image_to_buffer(
 __kernel void in_out_width_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                           UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                           __private const int global_size_dim0,
-                                           __private const int global_size_dim1,
-#endif
                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
                                           __private const int height,
@@ -435,7 +405,7 @@ __kernel void in_out_width_buffer_to_image(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -468,10 +438,7 @@ __kernel void in_out_width_buffer_to_image(
 // only support 3x3 now
 __kernel void winograd_filter_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-#endif
                                              __global const DATA_TYPE *input, //Oc, Ic, H, W
                                              __private const int input_offset,
                                              __private const int in_channels,
@@ -481,7 +448,7 @@ __kernel void winograd_filter_buffer_to_image(
  int w = get_global_id(0);
  int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }
@@ -563,10 +530,7 @@ __kernel void winograd_filter_buffer_to_image(
 // only support 3x3 now
 __kernel void winograd_filter_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-#endif
                                              __global DATA_TYPE *output, //Oc, Ic, H, W
                                              __private const int height,
                                              __private const int width,
@@ -575,7 +539,7 @@ __kernel void winograd_filter_image_to_buffer(
  const int w = get_global_id(0);
  const int h = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || h >= global_size_dim1) {
    return;
  }

--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -2,11 +2,7 @@
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
 __kernel void channel_shuffle(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                              __private const int global_size_dim0,
-                              __private const int global_size_dim1,
-                              __private const int global_size_dim2,
-#endif
                              __read_only image2d_t input,
                              __private const int groups,
                              __private const int channels_per_group,
@@ -15,7 +11,7 @@ __kernel void channel_shuffle(
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
      || hb_idx >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -18,6 +18,23 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
+#ifndef NON_UNIFORM_WORK_GROUP
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,       \
+    __private const int global_size_dim2,
+#else
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+#endif
 __constant sampler_t SAMPLER =
    CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -23,11 +23,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 // Supported data type: half/float
 __kernel void concat_channel(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
                             __read_only image2d_t input0,
                             __read_only image2d_t input1,
                             __private const int input0_chan,
@@ -36,7 +32,7 @@ __kernel void concat_channel(
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
      || hb_idx >= global_size_dim2) {
    return;
@@ -89,11 +85,7 @@ __kernel void concat_channel(
 // Required: All input channels are divisible by 4
 __kernel void concat_channel_multi(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                                   __private const int global_size_dim0,
-                                   __private const int global_size_dim1,
-                                   __private const int global_size_dim2,
-#endif
                                   __read_only image2d_t input,
                                   __private const int chan_blk_offset,
                                   __write_only image2d_t output) {
@@ -101,7 +93,7 @@ __kernel void concat_channel_multi(
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
      || hb_idx >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
 #include <common.h>
 __kernel void conv_2d(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2,
-#endif
                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
@@ -29,7 +25,7 @@ __kernel void conv_2d(
  const int out_w_blk = get_global_id(1);
  const int out_hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
      || out_hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
 #include <common.h>
 __kernel void conv_2d_1x1(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2,
-#endif
                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
@@ -23,7 +19,7 @@ __kernel void conv_2d_1x1(
  const int out_w_blk = get_global_id(1);
  const int out_hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
      || out_hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
 #include <common.h>
 __kernel void conv_2d_3x3(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2,
-#endif
                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
@@ -27,7 +23,7 @@ __kernel void conv_2d_3x3(
  const int out_w_blk = get_global_id(1);
  const int out_hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
      || out_hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
 #include <common.h>
 __kernel void depth_to_space(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
                             __read_only image2d_t input,
                             __private const int block_size,
                             __private const int output_depth,
@@ -14,7 +10,7 @@ __kernel void depth_to_space(
  const int out_w = get_global_id(1);
  const int out_h = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_d >= global_size_dim0 || out_w >= global_size_dim1
      || out_h >= global_size_dim2) {
    return;
@@ -43,11 +39,7 @@ __kernel void depth_to_space(
 }
 __kernel void space_to_depth(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
                             __read_only image2d_t input,
                             __private const int block_size,
                             __private const int input_depth,
@@ -57,7 +49,7 @@ __kernel void space_to_depth(
  const int w = get_global_id(1);
  const int h = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (d >= global_size_dim0 || w >= global_size_dim1
      || h >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -2,11 +2,7 @@
 // Only multiplier = 1 is supported
 __kernel void depthwise_conv2d(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                               UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                               __private const int global_size_dim0,
-                               __private const int global_size_dim1,
-                               __private const int global_size_dim2,
-#endif
                               __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -29,7 +25,7 @@ __kernel void depthwise_conv2d(
  const short out_w_blk = get_global_id(1);
  const short out_hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
      || out_hb >= global_size_dim2) {
    return;
@@ -143,11 +139,7 @@ __kernel void depthwise_conv2d(
 }
 __kernel void depthwise_conv2d_s1(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-                                  __private const int global_size_dim2,
-#endif
                                  __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                  __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -168,7 +160,7 @@ __kernel void depthwise_conv2d_s1(
  const short out_w_blk = get_global_id(1) << 2;
  const short out_hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
      || out_hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
 #include <common.h>
 __kernel void eltwise(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-#endif
                      __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t input1,
 #ifdef COEFF_SUM
@@ -15,7 +12,7 @@ __kernel void eltwise(
  const int w = get_global_id(0);
  const int hb = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
 #endif

--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -2,10 +2,7 @@
 // C = A * B
 __kernel void matmul(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                     __private const int global_size_dim0,
-                     __private const int global_size_dim1,
-#endif
                     __read_only image2d_t A,
                     __read_only image2d_t B,
                     __write_only image2d_t C,
@@ -17,7 +14,7 @@ __kernel void matmul(
  const int gx = get_global_id(0) << 2;
  const int hb = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
 #endif

--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -20,11 +20,7 @@ inline int calculate_avg_block_size(const int pool_size,
 // Supported data type: half/float
 __kernel void pooling(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2,
-#endif
                      __read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
@@ -39,7 +35,7 @@ __kernel void pooling(
  const int out_width_idx = get_global_id(1);
  const int out_hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
      || out_hb_idx >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
 #include <common.h>
 __kernel void resize_bilinear_nocache(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                                      __private const int global_size_dim0,
-                                      __private const int global_size_dim1,
-                                      __private const int global_size_dim2,
-#endif
                                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
@@ -18,7 +14,7 @@ __kernel void resize_bilinear_nocache(
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
      || hb >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
 #include <common.h>
 __kernel void slice(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                    __private const int global_size_dim0,
-                    __private const int global_size_dim1,
-                    __private const int global_size_dim2,
-#endif
                    __read_only image2d_t input,
                    __private const int chan_blk_offset,
                    __write_only image2d_t output) {
@@ -13,7 +9,7 @@ __kernel void slice(
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
      || hb_idx >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
 #include <common.h>
 __kernel void softmax(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2,
-#endif
                      __read_only image2d_t input,
                      __private const int channels,
                      __private const int remain_channels,
@@ -14,7 +10,7 @@ __kernel void softmax(
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
      || hb_idx >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
 #include <common.h>
 __kernel void space_to_batch(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
                             __read_only image2d_t space_data,
                             __write_only image2d_t batch_data,
                             __private const int block_height,
@@ -20,7 +16,7 @@ __kernel void space_to_batch(
  const int batch_w_idx = get_global_id(1);
  const int batch_hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
      || batch_hb_idx >= global_size_dim2) {
    return;
@@ -53,11 +49,7 @@ __kernel void space_to_batch(
 }
 __kernel void batch_to_space(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,
                             __private const int block_height,
@@ -72,7 +64,7 @@ __kernel void batch_to_space(
  const int batch_w_idx = get_global_id(1);
  const int batch_hb_idx = get_global_id(2);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
      || batch_hb_idx >= global_size_dim2) {
    return;

--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
 #include <common.h>
 __kernel void winograd_transform_2x2(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
                                     __read_only image2d_t input,
                                     __write_only image2d_t output,
                                     __private const int in_height,
@@ -17,7 +14,7 @@ __kernel void winograd_transform_2x2(
  int out_width_idx = get_global_id(0);
  int chan_blk_idx = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
    return;
  }
@@ -120,10 +117,7 @@ __kernel void winograd_transform_2x2(
 }
 __kernel void winograd_inverse_transform_2x2(
-#ifndef USE_QUALCOMM_OPENCL_2_0
+                                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
-                                             __private const int global_size_dim0,
-                                             __private const int global_size_dim1,
-#endif
                                             __read_only image2d_t input,
 #ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -137,7 +131,7 @@ __kernel void winograd_inverse_transform_2x2(
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
  if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
    return;
  }

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -18,7 +18,6 @@ static void Concat2(cl::Kernel *kernel,
                    std::vector<index_t> *prev_input_shape,
                    Tensor *output,
                    StatsFuture *future,
-                    bool *is_non_uniform_work_groups_supported,
                    uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
@@ -34,13 +33,11 @@ static void Concat2(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
    built_options.emplace("-Dconcat_channel=" + kernel_name);
-    if (*is_non_uniform_work_groups_supported) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    if (input0->dtype() == output->dtype()) {
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
@@ -53,10 +50,13 @@ static void Concat2(cl::Kernel *kernel,
      built_options.emplace("-DDIVISIBLE_FOUR");
    }
    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
    uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel->setArg(idx++, gws[0]);
      kernel->setArg(idx++, gws[1]);
      kernel->setArg(idx++, gws[2]);
@@ -70,9 +70,6 @@ static void Concat2(cl::Kernel *kernel,
                   *(static_cast<cl::Image2D *>(output->opencl_image())));
    *prev_input_shape = input0->shape();
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
@@ -87,7 +84,6 @@ static void ConcatN(cl::Kernel *kernel,
                    const DataType dt,
                    Tensor *output,
                    StatsFuture *future,
-                    bool *is_non_uniform_work_groups_supported,
                    uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
@@ -97,17 +93,17 @@ static void ConcatN(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
    built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const int inputs_count = input_list.size();
@@ -121,7 +117,7 @@ static void ConcatN(cl::Kernel *kernel,
    };
    uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel->setArg(idx++, gws[0]);
      kernel->setArg(idx++, gws[1]);
      kernel->setArg(idx++, gws[2]);
@@ -131,8 +127,6 @@ static void ConcatN(cl::Kernel *kernel,
    kernel->setArg(idx++, *(output->opencl_image()));
    chan_blk_offset += input_channel_blk;
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
    const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
    std::stringstream ss;
    ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
@@ -178,13 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
  switch (inputs_count) {
    case 2:
      Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              &input_shape_, output, future,
+              &input_shape_, output, future, &kwg_size_);
-              &is_non_uniform_work_groups_supported_, &kwg_size_);
      break;
    default:
      if (divisible_four) {
        ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
-            &is_non_uniform_work_groups_supported_, &kwg_size_);
+            &kwg_size_);
      } else {
        MACE_NOT_IMPLEMENTED;
      }

--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -21,7 +21,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
                             StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                             uint32_t *kwg_size);
 extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
@@ -37,7 +36,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
                             StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                             uint32_t *kwg_size);
 extern void Conv2dOpencl(cl::Kernel *kernel,
@@ -53,7 +51,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         std::vector<index_t> *prev_input_shape,
                         Tensor *output,
                         StatsFuture *future,
-                         bool *is_non_uniform_work_groups_supported,
                         uint32_t *kwg_size);
 template <typename T>
@@ -68,7 +65,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      const int *dilations, const ActivationType activation,
      const float relux_max_limit, const DataType dt,
      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
-      bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size);
+      uint32_t *kwg_size);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5] = {
      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
@@ -109,12 +106,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                dilations_, activation_, relux_max_limit_,
                DataTypeToEnum<T>::value, &input_shape_, output, future,
-                &is_non_uniform_work_groups_supported_, &kwg_size_);
+                &kwg_size_);
  } else {
    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
                 DataTypeToEnum<T>::value, &input_shape_, output, future,
-                 &is_non_uniform_work_groups_supported_, &kwg_size_);
+                 &kwg_size_);
  }
 }

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -23,7 +23,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
                             StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                             uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
@@ -41,8 +40,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
    MACE_CHECK(input_batch == batch);
    std::set<std::string> built_options;
@@ -50,8 +47,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
@@ -76,6 +73,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    }
    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -84,7 +84,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel->setArg(idx++, gws[0]);
      kernel->setArg(idx++, gws[1]);
      kernel->setArg(idx++, gws[2]);
@@ -105,9 +105,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    kernel->setArg(idx++, stride);
    *prev_input_shape = input->shape();
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -25,7 +25,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
                             StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                             uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
@@ -40,15 +39,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
    built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation) {
@@ -71,6 +68,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    }
    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -79,7 +79,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel->setArg(idx++, gws[0]);
      kernel->setArg(idx++, gws[1]);
      kernel->setArg(idx++, gws[2]);
@@ -103,9 +103,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    kernel->setArg(idx++, dilations[1]);
    *prev_input_shape = input->shape();
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -25,7 +25,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         std::vector<index_t> *prev_input_shape,
                         Tensor *output,
                         StatsFuture *future,
-                         bool *is_non_uniform_work_groups_supported,
                         uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
@@ -40,15 +39,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
    built_options.emplace("-Dconv_2d=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation) {
@@ -71,6 +68,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    }
    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -79,7 +79,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel->setArg(idx++, gws[0]);
      kernel->setArg(idx++, gws[1]);
      kernel->setArg(idx++, gws[2]);
@@ -105,9 +105,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    kernel->setArg(idx++, dilations[1]);
    *prev_input_shape = input->shape();
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};

--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -48,8 +48,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
@@ -58,11 +56,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ =
        runtime->BuildKernel("depth_to_space", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  uint32_t gws[3];
@@ -83,7 +84,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    }
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -94,9 +95,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -24,7 +24,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                     std::vector<index_t> *prev_input_shape,
                     Tensor *output,
                     StatsFuture *future,
-                     bool *is_non_uniform_work_groups_supported,
                     uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
@@ -45,8 +44,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
  if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
    if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -55,8 +52,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    } else {
      built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
    }
-    if (*is_non_uniform_work_groups_supported) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
@@ -83,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    *kernel =
        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    const index_t input_batch = input->dim(0);
@@ -97,7 +97,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
               input_channels);
    uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel->setArg(idx++, gws[0]);
      kernel->setArg(idx++, gws[1]);
      kernel->setArg(idx++, gws[2]);
@@ -124,9 +124,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    }
    *prev_input_shape = input->shape();
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
@@ -185,7 +182,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
                  DataTypeToEnum<T>::value, &input_shape_, output, future,
-                  &is_non_uniform_work_groups_supported_, &kwg_size_);
+                  &kwg_size_);
 }
 template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -30,8 +30,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -39,15 +37,18 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
    }
@@ -60,9 +61,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -200,8 +200,6 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
-  const bool is_non_uniform_work_groups_supported =
-      runtime->IsNonUniformWorkgroupsSupported();
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
@@ -239,7 +237,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
        << "Tuning parameters of 3D kernel must be 4D";
    cl_int error = CL_SUCCESS;
    std::vector<uint32_t> roundup_gws(3);
-    if (!is_non_uniform_work_groups_supported) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      for (size_t i = 0; i < 3; ++i) {
        roundup_gws[i] = RoundUp(gws[i], params[i]);
      }
@@ -252,7 +250,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws2 =
            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        if (is_non_uniform_work_groups_supported) {
+        if (runtime->IsNonUniformWorkgroupsSupported()) {
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, 0, i * block_size),
              cl::NDRange(gws[0], gws[1], gws2),
@@ -268,7 +266,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      }
    } else {
      timer->ClearTiming();
-      if (is_non_uniform_work_groups_supported) {
+      if (runtime->IsNonUniformWorkgroupsSupported()) {
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
@@ -293,7 +291,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws2 =
              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          if (is_non_uniform_work_groups_supported) {
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NDRange(0, 0, i * block_size),
                cl::NDRange(gws[0], gws[1], gws2),
@@ -332,8 +330,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
-  const bool is_non_uniform_work_groups_supported =
-      runtime->IsNonUniformWorkgroupsSupported();
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
@@ -359,7 +355,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
        << "Tuning parameters of 2D kernel must be 3d";
    cl_int error = CL_SUCCESS;
    std::vector<uint32_t> roundup_gws(2);
-    if (!is_non_uniform_work_groups_supported) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      for (size_t i = 0; i < 2; ++i) {
        roundup_gws[i] = RoundUp(gws[i], params[i]);
      }
@@ -372,7 +368,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws1 =
            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        if (is_non_uniform_work_groups_supported) {
+        if (runtime->IsNonUniformWorkgroupsSupported()) {
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
              cl::NDRange(params[0], params[1]), nullptr, &event);
@@ -387,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
      }
    } else {
      timer->ClearTiming();
-      if (is_non_uniform_work_groups_supported) {
+      if (runtime->IsNonUniformWorkgroupsSupported()) {
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
            cl::NDRange(params[0], params[1]), nullptr, &event);
@@ -411,7 +407,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws1 =
              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          if (is_non_uniform_work_groups_supported) {
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NDRange(0, i * block_size),
                cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),

--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -34,21 +34,22 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
    built_options.emplace("-Dmatmul=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  uint32_t idx = 0;
-  if (!is_non_uniform_work_groups_supported_) {
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
    kernel_.setArg(idx++, gws[0]);
    kernel_.setArg(idx++, gws[1]);
  }
@@ -61,8 +62,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  kernel_.setArg(idx++, static_cast<int>(height_blocks));
  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
-  kwg_size_ =
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
  std::stringstream ss;
  ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -21,8 +21,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
@@ -39,10 +37,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (pooling_type_ == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  std::vector<uint32_t> gws;
@@ -81,7 +82,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    };
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -97,9 +98,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  } else {
    index_t batch = output->dim(0);
    index_t out_height = output->dim(1);

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -31,19 +31,20 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ =
        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
@@ -60,7 +61,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
        CalculateResizeScale(in_width, out_width, align_corners_);
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -74,9 +75,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<int32_t>(out_height));
    input_shape_ = input->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -32,18 +32,19 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
    built_options.emplace("-Dslice=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE="
                           + DtToCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const index_t channel_blk = RoundUpDiv4(output_channels);
@@ -53,8 +54,6 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
-  kwg_size_ =
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "slice_opencl_kernel_"
@@ -65,7 +64,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     << outputs_count;
  for (int i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);

--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -30,22 +30,23 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -56,9 +57,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = logits->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -39,8 +39,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    std::stringstream kernel_name_ss;
@@ -49,15 +47,18 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ =
        runtime->BuildKernel("space_to_batch", kernel_name, built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
      kernel_.setArg(idx++, gws[2]);
@@ -79,9 +80,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
    space_shape_ = space_tensor->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -18,8 +18,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
    std::set<std::string> built_options;
@@ -28,11 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                   built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
@@ -61,7 +62,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    output_tensor->ResizeImage(output_shape, image_shape);
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
    }
@@ -76,9 +77,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
    input_shape_ = input_tensor->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
@@ -99,8 +97,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
  if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
    std::set<std::string> built_options;
@@ -110,8 +106,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation_) {
@@ -138,6 +134,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                   built_options);
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const uint32_t gws[2] = {
@@ -153,7 +152,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const uint32_t round_h = (height_ + 1) / 2;
    const uint32_t round_w = (width_ + 1) / 2;
    uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
      kernel_.setArg(idx++, gws[0]);
      kernel_.setArg(idx++, gws[1]);
    }
@@ -173,9 +172,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, relux_max_limit_);
    input_shape_ = input_tensor->shape();
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -186,7 +186,6 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -174,7 +174,6 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -62,7 +62,6 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
                  StatsFuture *future);
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
 };
 }  // namespace kernels

--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -62,7 +62,6 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -57,7 +57,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> space_shape_;
 };

--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -52,7 +52,6 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };
@@ -111,7 +110,6 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
  cl::Kernel kernel_;
  uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };