diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 5bb2fe4ac384add95b0177ba3ae1d192742481ce..88840910a586346f4f962e594e02fa6e2e8179d0 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -156,7 +156,6 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
   T relux_max_limit_;
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::string tuning_key_prefix_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h
index e2d875e9a73e9d12668e6f11388060a35454e8ec..c8bb601620f1965b126bc39d2ef2259d26e91e68 100644
--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -91,7 +91,6 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h
index f17db80a48295d1bf7a24e5775fae4a17f9a81f0..5e8ae34f9a9aaef596090cc7113c440b425021cf 100644
--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -158,7 +158,6 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h
index f2f917f4f5073c3e585e70260678c95f9f13f59c..a3d1ff61b1c0fa0be7c7eb0506c495fc1331476c 100644
--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -65,7 +65,6 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h
index b93e657837a50f658aa9c3444b99e3a0d65cf761..69332e0182e4623a7ffb9e1e87fd05fbd9ca5b75 100644
--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -57,7 +57,6 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   const int groups_;
   std::vector<index_t> input_shape_;
 };
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 7186bde6e452983b3bc5620e3b620086907e19ab..6c803b5f2a32544cac7d79145b691b3166faeb9b 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -86,7 +86,6 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index f2d3dfbb53c40ca5ff5e7753333c88300ac8b535..4cd05a65d527a45caf8c34486be2696511406589 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -402,7 +402,6 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h
index 6b439db67ecb2c5c2f6ee2390e7900adfc90a307..6d30673976df6399351ad50c871d2f9e578cea22 100644
--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -109,7 +109,6 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   const int block_size_;
   bool d2s_;
   std::vector<index_t> input_shape_;
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index ba4f74c8a12132b1780467b38e35f52a8e127063..90c17b19e71553424c5f68eee1cc3bc9ffa2b279 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -438,7 +438,6 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h
index 11d52bc97e8802b04058589c6eb3bdb057607f00..9c7f0a901a5968f1d0f4cf5c7af8ceeebb465f7e 100644
--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -98,7 +98,6 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h
index 1ce9b6fd07f4a377664b03b821cf1b170dadea19..b025cbfebe29efa20d65838328458eb73befb823 100644
--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -242,7 +242,6 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index f41513c54dd66b38d28e9c0f21b21998711d0d0a..368a78bc9594da81e84831136fbfcfe93a4f295f 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -27,16 +27,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
     built_options.emplace("-Dactivation=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     switch (activation_) {
       case RELU:
@@ -63,6 +61,9 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
         LOG(FATAL) << "Unknown activation type: " << activation_;
     }
     kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -71,7 +72,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     int idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -85,9 +86,6 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index c2c19fa780a19e05aa43702322be799f97412417..dc2aabeb949f9c6e54b41ebc8c2eb1324f1e4694 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -35,8 +35,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
   }
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     if (input_tensors.size() > 4) {
       MACE_NOT_IMPLEMENTED;
     }
@@ -47,11 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
 
     kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+
+    kwg_size_ =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -70,7 +71,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, output_image_shape);
 
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
     }
@@ -80,9 +81,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, *(output_tensor->opencl_image()));
 
     input_shape_ = input_tensors[0]->shape();
-
-    kwg_size_ =
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 8065acba80374af51a2f1368e45f1e46ca4fc869..09be320fc680cb0e34306762a4672cd774f1d5ea 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -38,16 +38,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
     built_options.emplace("-Dbatch_norm=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     if (folded_constant_) {
       built_options.emplace("-DFOLDED_CONSTANT");
@@ -72,10 +70,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     }
 
     kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -92,9 +93,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, relux_max_limit_);
 
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index a518f074209a05b2676356ff6bc3d50bf890abd7..1197a3590599b4aac96ac97ce0c8fc58915d0046 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -30,22 +30,23 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
     built_options.emplace("-Dbias_add=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -54,16 +55,13 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, *(bias->opencl_image()));
     kernel_.setArg(idx++, *(output->opencl_image()));
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
 
   cl::Event event;
   cl_int error;
-  if (is_non_uniform_work_groups_supported_) {
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
         cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc
index 5d7ae4c1b47d7af2a0897ba17bf158148bdf5b12..8b570b3369aca067d53bb0286c3bf9c354f8f74a 100644
--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -62,16 +62,13 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
 
   auto runtime = OpenCLRuntime::Global();
 
-  const bool is_non_uniform_work_groups_supported =
-      runtime->IsNonUniformWorkgroupsSupported();
-
   std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
   std::set<std::string> built_options;
   std::stringstream kernel_name_ss;
   kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
   built_options.emplace(kernel_name_ss.str());
-  if (is_non_uniform_work_groups_supported) {
-    built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
   }
   if (buffer->dtype() == image->dtype()) {
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
@@ -87,7 +84,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                                          obfuscated_kernel_name, built_options);
 
   uint32_t idx = 0;
-  if (!is_non_uniform_work_groups_supported) {
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
     b2f_kernel.setArg(idx++, gws[0]);
     b2f_kernel.setArg(idx++, gws[1]);
   }
@@ -123,7 +120,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
 
   cl::Event event;
   cl_int error;
-  if (is_non_uniform_work_groups_supported) {
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
     error = runtime->command_queue().enqueueNDRangeKernel(
         b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
         cl::NDRange(lws[0], lws[1]), nullptr, &event);
diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc
index 29097345417aa0a1e30f532a25c8735b2831dd05..59f2c9518061a7e1e6f0e8071a18cec699814be4 100644
--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -37,24 +37,25 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
     built_options.emplace("-Dchannel_shuffle=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
                                    built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   if (!IsVecEqual(input_shape_, input->shape())) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -65,9 +66,6 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/cl/activation.cl b/mace/kernels/opencl/cl/activation.cl
index 7976dd38c811a082c186507c2f7a5f446aadd6c9..42afc7012528242475b3fc61a8a9bdfdb5623772 100644
--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void activation(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2,
-#endif
+                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                          __read_only image2d_t input,
 #ifdef USE_PRELU
                          __read_only image2d_t alpha,
@@ -16,7 +12,7 @@ __kernel void activation(
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index 09dd5c388de0bf5b30a873d6a20a55181e341d06..d0604f9ed074c5a5d2729fd8c66751d9ab7b751b 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -1,10 +1,7 @@
 #include <common.h>
 
 __kernel void addn(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                   __private const int global_size_dim0,
-                   __private const int global_size_dim1,
-#endif
+                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                    __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                    __read_only image2d_t input1,
 #if INPUT_NUM > 2
@@ -17,7 +14,7 @@ __kernel void addn(
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || hb >= global_size_dim1) return;
 #endif
 
diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 0e592fdcbd26bfa3b274ea222d7d7cb3070b132d..0075932dbac599780803ac7041da293dfbbc1447 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 // Supported data types: half/float
 __kernel void batch_norm(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                         __private const int global_size_dim0,
-                         __private const int global_size_dim1,
-                         __private const int global_size_dim2,
-#endif
+                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                          __read_only image2d_t input,
                          __read_only image2d_t scale,
                          __read_only image2d_t offset,
@@ -20,7 +16,7 @@ __kernel void batch_norm(
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index ee7b60786a57466178e3517face1ee7900ac639c..a2d99abcc8e21e19e0710db8f752df3a6032d56f 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 // Supported data types: half/float
 __kernel void bias_add(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                       __private const int global_size_dim0,
-                       __private const int global_size_dim1,
-                       __private const int global_size_dim2,
-#endif
+                       UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                        __read_only image2d_t input,
                        __read_only image2d_t bias,
                        __write_only image2d_t output) {
@@ -13,7 +9,7 @@ __kernel void bias_add(
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index 7e764503656fbf7d4ed41c245753663899c85fcb..86071708117efe6a7d4f0580d0324e2ad0701962 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -1,10 +1,7 @@
 #include <common.h>
 
 __kernel void filter_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                      __global const DATA_TYPE *input, /* h, w, oc, ic */
                                      __private const int input_offset,
                                      __private const int filter_h,
@@ -15,7 +12,7 @@ __kernel void filter_buffer_to_image(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -57,10 +54,7 @@ __kernel void filter_buffer_to_image(
 }
 
 __kernel void filter_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                      __global DATA_TYPE *output, /* h, w, oc, ic */
                                      __private const int filter_h,
                                      __private const int filter_w,
@@ -70,7 +64,7 @@ __kernel void filter_image_to_buffer(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -109,10 +103,7 @@ __kernel void filter_image_to_buffer(
 }
 
 __kernel void dw_filter_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                        __private const int global_size_dim0,
-                                        __private const int global_size_dim1,
-#endif
+                                        UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                         __global const DATA_TYPE *input, /* h, w, ic, m */
                                         __private const int input_offset,
                                         __private const int filter_w,
@@ -122,7 +113,7 @@ __kernel void dw_filter_buffer_to_image(
   const int w = get_global_id(0);
   const int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -170,10 +161,7 @@ __kernel void dw_filter_buffer_to_image(
 }
 
 __kernel void in_out_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                      __global const DATA_TYPE *input, /* nhwc */
                                      __private const int input_offset,
                                      __private const int height,
@@ -183,7 +171,7 @@ __kernel void in_out_buffer_to_image(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -215,10 +203,7 @@ __kernel void in_out_buffer_to_image(
 }
 
 __kernel void in_out_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                      __global DATA_TYPE *output, /* nhwc */
                                      __private const int height,
                                      __private const int width,
@@ -227,7 +212,7 @@ __kernel void in_out_image_to_buffer(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -258,10 +243,7 @@ __kernel void in_out_image_to_buffer(
 }
 
 __kernel void arg_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-#endif
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                   __global const DATA_TYPE *input, /* nhwc */
                                   __private const int input_offset,
                                   __private const int count,
@@ -269,7 +251,7 @@ __kernel void arg_buffer_to_image(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -297,17 +279,14 @@ __kernel void arg_buffer_to_image(
 }
 
 __kernel void arg_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-#endif
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                   __global DATA_TYPE *output, /* nhwc */
                                   __private const int count,
                                   __read_only image2d_t input) {
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -334,10 +313,7 @@ __kernel void arg_image_to_buffer(
 
 
 __kernel void in_out_height_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1,
-#endif
+                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                             __global const DATA_TYPE *input, //nhwc
                                             __private const int input_offset,
                                             __private const int height,
@@ -347,7 +323,7 @@ __kernel void in_out_height_buffer_to_image(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -380,10 +356,7 @@ __kernel void in_out_height_buffer_to_image(
 }
 
 __kernel void in_out_height_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                            __private const int global_size_dim0,
-                                            __private const int global_size_dim1,
-#endif
+                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                             __global DATA_TYPE *output, //nhwc
                                             __private const int height,
                                             __private const int width,
@@ -392,7 +365,7 @@ __kernel void in_out_height_image_to_buffer(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -422,10 +395,7 @@ __kernel void in_out_height_image_to_buffer(
 
 
 __kernel void in_out_width_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                           __private const int global_size_dim0,
-                                           __private const int global_size_dim1,
-#endif
+                                           UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                            __global const DATA_TYPE *input, /* nhwc */
                                            __private const int input_offset,
                                            __private const int height,
@@ -435,7 +405,7 @@ __kernel void in_out_width_buffer_to_image(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -468,10 +438,7 @@ __kernel void in_out_width_buffer_to_image(
 
 // only support 3x3 now
 __kernel void winograd_filter_buffer_to_image(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-#endif
+                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                               __global const DATA_TYPE *input, //Oc, Ic, H, W
                                               __private const int input_offset,
                                               __private const int in_channels,
@@ -481,7 +448,7 @@ __kernel void winograd_filter_buffer_to_image(
   int w = get_global_id(0);
   int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
@@ -563,10 +530,7 @@ __kernel void winograd_filter_buffer_to_image(
 
 // only support 3x3 now
 __kernel void winograd_filter_image_to_buffer(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                              __private const int global_size_dim0,
-                                              __private const int global_size_dim1,
-#endif
+                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                               __global DATA_TYPE *output, //Oc, Ic, H, W
                                               __private const int height,
                                               __private const int width,
@@ -575,7 +539,7 @@ __kernel void winograd_filter_image_to_buffer(
   const int w = get_global_id(0);
   const int h = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || h >= global_size_dim1) {
     return;
   }
diff --git a/mace/kernels/opencl/cl/channel_shuffle.cl b/mace/kernels/opencl/cl/channel_shuffle.cl
index 5bf0e067a30d4248eff32858ec11fe171fff53b5..3fa2894e8bf60b8e7528ccd2562fc179afd9f46e 100644
--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
@@ -2,11 +2,7 @@
 
 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
 __kernel void channel_shuffle(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                              __private const int global_size_dim0,
-                              __private const int global_size_dim1,
-                              __private const int global_size_dim2,
-#endif
+                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                               __read_only image2d_t input,
                               __private const int groups,
                               __private const int channels_per_group,
@@ -15,7 +11,7 @@ __kernel void channel_shuffle(
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/common.h b/mace/kernels/opencl/cl/common.h
index dd61012cebdb887639f132d63e48d9521898e443..30aad065ac12da29c3eb661bcf17069a023b16e6 100644
--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -18,6 +18,23 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
 
+#ifndef NON_UNIFORM_WORK_GROUP
+
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,       \
+    __private const int global_size_dim2,
+
+#else
+
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+
+#endif
+
 __constant sampler_t SAMPLER =
     CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index 2658025df79333f23c063a99310c1acecb3cd014..3b7370a8a30ba21a0c22305d1ef84e66314d7153 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -23,11 +23,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 
 // Supported data type: half/float
 __kernel void concat_channel(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                              __read_only image2d_t input0,
                              __read_only image2d_t input1,
                              __private const int input0_chan,
@@ -36,7 +32,7 @@ __kernel void concat_channel(
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
@@ -89,11 +85,7 @@ __kernel void concat_channel(
 
 // Required: All input channels are divisible by 4
 __kernel void concat_channel_multi(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                   __private const int global_size_dim0,
-                                   __private const int global_size_dim1,
-                                   __private const int global_size_dim2,
-#endif
+                                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                                    __read_only image2d_t input,
                                    __private const int chan_blk_offset,
                                    __write_only image2d_t output) {
@@ -101,7 +93,7 @@ __kernel void concat_channel_multi(
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index f40f31dab55dd7da07385004185ea5d9e84b9bfc..1383557d89d96b4a282773f16dabbae59b7b798b 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void conv_2d(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2,
-#endif
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
@@ -29,7 +25,7 @@ __kernel void conv_2d(
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 96d9a2c06aae8fd81e420eb1b0b407149bc13bdd..e993a159e3e82fa5c110881647ffb290b75c4832 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void conv_2d_1x1(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2,
-#endif
+                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
@@ -23,7 +19,7 @@ __kernel void conv_2d_1x1(
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index b159fd6af0fdcdecd2f9586f9992f1412548398c..8bc27b33569109fad1e9207c910299c6ebcaac0b 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void conv_2d_3x3(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                          __private const int global_size_dim0,
-                          __private const int global_size_dim1,
-                          __private const int global_size_dim2,
-#endif
+                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                           __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
@@ -27,7 +23,7 @@ __kernel void conv_2d_3x3(
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/depth_to_space.cl b/mace/kernels/opencl/cl/depth_to_space.cl
index 2a5a8893ea2c86ba6a13cc1d9dc24dfa728088ac..a52617c87367635f697fc29f7c56315b6347bf13 100644
--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void depth_to_space(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                              __read_only image2d_t input,
                              __private const int block_size,
                              __private const int output_depth,
@@ -14,7 +10,7 @@ __kernel void depth_to_space(
   const int out_w = get_global_id(1);
   const int out_h = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_d >= global_size_dim0 || out_w >= global_size_dim1
       || out_h >= global_size_dim2) {
     return;
@@ -43,11 +39,7 @@ __kernel void depth_to_space(
 }
 
 __kernel void space_to_depth(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                              __read_only image2d_t input,
                              __private const int block_size,
                              __private const int input_depth,
@@ -57,7 +49,7 @@ __kernel void space_to_depth(
   const int w = get_global_id(1);
   const int h = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (d >= global_size_dim0 || w >= global_size_dim1
       || h >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl
index 1974d8db50935f2abc96e6a309e8fc41f448deb3..fff19613c9dfad3f3e4a80fed57c60e99d1ec43f 100644
--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -2,11 +2,7 @@
 
 // Only multiplier = 1 is supported
 __kernel void depthwise_conv2d(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                               __private const int global_size_dim0,
-                               __private const int global_size_dim1,
-                               __private const int global_size_dim2,
-#endif
+                               UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                                __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -29,7 +25,7 @@ __kernel void depthwise_conv2d(
   const short out_w_blk = get_global_id(1);
   const short out_hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
@@ -143,11 +139,7 @@ __kernel void depthwise_conv2d(
 }
 
 __kernel void depthwise_conv2d_s1(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                  __private const int global_size_dim0,
-                                  __private const int global_size_dim1,
-                                  __private const int global_size_dim2,
-#endif
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                                   __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                   __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
@@ -168,7 +160,7 @@ __kernel void depthwise_conv2d_s1(
   const short out_w_blk = get_global_id(1) << 2;
   const short out_hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
       || out_hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/eltwise.cl b/mace/kernels/opencl/cl/eltwise.cl
index 9a69af1af46cacff54c20c1c5583f25bfdec4907..def21f0a993b75d321729e5c89b080555c1dcdf7 100644
--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
@@ -1,10 +1,7 @@
 #include <common.h>
 
 __kernel void eltwise(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-#endif
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                       __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                       __read_only image2d_t input1,
 #ifdef COEFF_SUM
@@ -15,7 +12,7 @@ __kernel void eltwise(
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (w >= global_size_dim0 || hb >= global_size_dim1) return;
 #endif
 
diff --git a/mace/kernels/opencl/cl/matmul.cl b/mace/kernels/opencl/cl/matmul.cl
index c3efc9f25a8869af9e10f27d74d6ea113d94826e..fe260e7a22477ea958936b30378b439c8c94fb2f 100644
--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
@@ -2,10 +2,7 @@
 
 // C = A * B
 __kernel void matmul(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                     __private const int global_size_dim0,
-                     __private const int global_size_dim1,
-#endif
+                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                      __read_only image2d_t A,
                      __read_only image2d_t B,
                      __write_only image2d_t C,
@@ -17,7 +14,7 @@ __kernel void matmul(
   const int gx = get_global_id(0) << 2;
   const int hb = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
 #endif
 
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index 0a28b745d5f78cbd8b73181dff2b960c664aa6ae..ead839940e5081e6d90e841f3eda569339a2ffa1 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -20,11 +20,7 @@ inline int calculate_avg_block_size(const int pool_size,
 
 // Supported data type: half/float
 __kernel void pooling(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2,
-#endif
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                       __read_only image2d_t input,
                       __private const int in_height,
                       __private const int in_width,
@@ -39,7 +35,7 @@ __kernel void pooling(
   const int out_width_idx = get_global_id(1);
   const int out_hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
       || out_hb_idx >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index 8c5b7a33b561c7bbd37366630725546f64a9d72d..83e6df85c5c7c1c4b1ee9facf62d73c7cd0f5a58 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void resize_bilinear_nocache(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                      __private const int global_size_dim0,
-                                      __private const int global_size_dim1,
-                                      __private const int global_size_dim2,
-#endif
+                                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                                       __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                       __write_only image2d_t output,
                                       __private const float height_scale,
@@ -18,7 +14,7 @@ __kernel void resize_bilinear_nocache(
   const int w = get_global_id(1);
   const int hb = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (ch_blk >= global_size_dim0 || w >= global_size_dim1
       || hb >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
index 4517ec99a0f23c58377de0d784cb59f39bf604c1..eccdd882c75a809804d61599b5288a432d2d432e 100644
--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void slice(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                    __private const int global_size_dim0,
-                    __private const int global_size_dim1,
-                    __private const int global_size_dim2,
-#endif
+                    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                     __read_only image2d_t input,
                     __private const int chan_blk_offset,
                     __write_only image2d_t output) {
@@ -13,7 +9,7 @@ __kernel void slice(
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/softmax.cl b/mace/kernels/opencl/cl/softmax.cl
index 11ff80bf9f601d874ea81df7fac2f12a8001fa2e..628d71cbb3a22b4ca240446c7e7c889dc1fc55ab 100644
--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void softmax(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                      __private const int global_size_dim0,
-                      __private const int global_size_dim1,
-                      __private const int global_size_dim2,
-#endif
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                       __read_only image2d_t input,
                       __private const int channels,
                       __private const int remain_channels,
@@ -14,7 +10,7 @@ __kernel void softmax(
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
       || hb_idx >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/space_to_batch.cl b/mace/kernels/opencl/cl/space_to_batch.cl
index 0a54601269284a5038d733154a836515618e7443..1e2024043f97f835c03c41ecc03fea7b86617ed2 100644
--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
@@ -1,11 +1,7 @@
 #include <common.h>
 
 __kernel void space_to_batch(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                              __read_only image2d_t space_data,
                              __write_only image2d_t batch_data,
                              __private const int block_height,
@@ -20,7 +16,7 @@ __kernel void space_to_batch(
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
       || batch_hb_idx >= global_size_dim2) {
     return;
@@ -53,11 +49,7 @@ __kernel void space_to_batch(
 }
 
 __kernel void batch_to_space(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                             __private const int global_size_dim0,
-                             __private const int global_size_dim1,
-                             __private const int global_size_dim2,
-#endif
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
                              __read_only image2d_t batch_data,
                              __write_only image2d_t space_data,
                              __private const int block_height,
@@ -72,7 +64,7 @@ __kernel void batch_to_space(
   const int batch_w_idx = get_global_id(1);
   const int batch_hb_idx = get_global_id(2);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
       || batch_hb_idx >= global_size_dim2) {
     return;
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index f3f99cfa567514ae4ba3a99c73930dd8e13efa6f..d447001e6ccd09f44f2d60c658be778c0e1fbff9 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -1,10 +1,7 @@
 #include <common.h>
 
 __kernel void winograd_transform_2x2(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                     __private const int global_size_dim0,
-                                     __private const int global_size_dim1,
-#endif
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                      __read_only image2d_t input,
                                      __write_only image2d_t output,
                                      __private const int in_height,
@@ -17,7 +14,7 @@ __kernel void winograd_transform_2x2(
   int out_width_idx = get_global_id(0);
   int chan_blk_idx = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
     return;
   }
@@ -120,10 +117,7 @@ __kernel void winograd_transform_2x2(
 }
 
 __kernel void winograd_inverse_transform_2x2(
-#ifndef USE_QUALCOMM_OPENCL_2_0
-                                             __private const int global_size_dim0,
-                                             __private const int global_size_dim1,
-#endif
+                                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
                                              __read_only image2d_t input,
 #ifdef BIAS
                                              __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -137,7 +131,7 @@ __kernel void winograd_inverse_transform_2x2(
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
 
-#ifndef USE_QUALCOMM_OPENCL_2_0
+#ifndef NON_UNIFORM_WORK_GROUP
   if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
     return;
   }
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 56449d14188d778fc25cc5e906f2b083f9e9ce42..bccc8623997c7b356190ecc4818fc3394eb45d89 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -18,7 +18,6 @@ static void Concat2(cl::Kernel *kernel,
                     std::vector<index_t> *prev_input_shape,
                     Tensor *output,
                     StatsFuture *future,
-                    bool *is_non_uniform_work_groups_supported,
                     uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -34,13 +33,11 @@ static void Concat2(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
     built_options.emplace("-Dconcat_channel=" + kernel_name);
-    if (*is_non_uniform_work_groups_supported) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     if (input0->dtype() == output->dtype()) {
       built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
@@ -53,10 +50,13 @@ static void Concat2(cl::Kernel *kernel,
       built_options.emplace("-DDIVISIBLE_FOUR");
     }
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
   if (!IsVecEqual(*prev_input_shape, input0->shape())) {
     uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
       kernel->setArg(idx++, gws[2]);
@@ -70,9 +70,6 @@ static void Concat2(cl::Kernel *kernel,
                    *(static_cast<cl::Image2D *>(output->opencl_image())));
 
     *prev_input_shape = input0->shape();
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
@@ -87,7 +84,6 @@ static void ConcatN(cl::Kernel *kernel,
                     const DataType dt,
                     Tensor *output,
                     StatsFuture *future,
-                    bool *is_non_uniform_work_groups_supported,
                     uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -97,17 +93,17 @@ static void ConcatN(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
     built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const int inputs_count = input_list.size();
@@ -121,7 +117,7 @@ static void ConcatN(cl::Kernel *kernel,
     };
 
     uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
       kernel->setArg(idx++, gws[2]);
@@ -131,8 +127,6 @@ static void ConcatN(cl::Kernel *kernel,
     kernel->setArg(idx++, *(output->opencl_image()));
 
     chan_blk_offset += input_channel_blk;
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
     const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
     std::stringstream ss;
     ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
@@ -178,13 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
   switch (inputs_count) {
     case 2:
       Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              &input_shape_, output, future,
-              &is_non_uniform_work_groups_supported_, &kwg_size_);
+              &input_shape_, output, future, &kwg_size_);
       break;
     default:
       if (divisible_four) {
         ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
-            &is_non_uniform_work_groups_supported_, &kwg_size_);
+            &kwg_size_);
       } else {
         MACE_NOT_IMPLEMENTED;
       }
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index b9fa2d4c86b259bf9f9691654a92746071cad545..5a002666320dfbfbea8263d1693bae53231da952 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -21,7 +21,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                              uint32_t *kwg_size);
 
 extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
@@ -37,7 +36,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                              uint32_t *kwg_size);
 
 extern void Conv2dOpencl(cl::Kernel *kernel,
@@ -53,7 +51,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          std::vector<index_t> *prev_input_shape,
                          Tensor *output,
                          StatsFuture *future,
-                         bool *is_non_uniform_work_groups_supported,
                          uint32_t *kwg_size);
 
 template <typename T>
@@ -68,7 +65,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
       const int *dilations, const ActivationType activation,
       const float relux_max_limit, const DataType dt,
       std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
-      bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size);
+      uint32_t *kwg_size);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5] = {
       Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
@@ -109,12 +106,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
                 DataTypeToEnum<T>::value, &input_shape_, output, future,
-                &is_non_uniform_work_groups_supported_, &kwg_size_);
+                &kwg_size_);
   } else {
     Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
                  DataTypeToEnum<T>::value, &input_shape_, output, future,
-                 &is_non_uniform_work_groups_supported_, &kwg_size_);
+                 &kwg_size_);
   }
 }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 34055189421eeb7fb583b4e610f51cd01731b1ed..5cdf8e5608386ea7547f20d239e5126d69d50efc 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -23,7 +23,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                              uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -41,8 +40,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
     MACE_CHECK(input_batch == batch);
 
     std::set<std::string> built_options;
@@ -50,8 +47,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     if (bias != nullptr) {
       built_options.emplace("-DBIAS");
@@ -76,6 +73,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     }
 
     *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -84,7 +84,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
       kernel->setArg(idx++, gws[2]);
@@ -105,9 +105,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
     kernel->setArg(idx++, stride);
 
     *prev_input_shape = input->shape();
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 88793dacb06dc5e75ca6bee7a8cb57c8dd0c6775..d0f587452579a7dfe6dbd64d29fa02ab9bf73297 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -25,7 +25,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                              std::vector<index_t> *prev_input_shape,
                              Tensor *output,
                              StatsFuture *future,
-                             bool *is_non_uniform_work_groups_supported,
                              uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -40,15 +39,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
     built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
@@ -71,6 +68,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     }
 
     *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -79,7 +79,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
       kernel->setArg(idx++, gws[2]);
@@ -103,9 +103,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
     kernel->setArg(idx++, dilations[1]);
 
     *prev_input_shape = input->shape();
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 191322095748de1b7a8ee2c4f6ca7049fca5a8b2..b9b2fec56a13dc55ee97649098f0674425ddcd4e 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -25,7 +25,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                          std::vector<index_t> *prev_input_shape,
                          Tensor *output,
                          StatsFuture *future,
-                         bool *is_non_uniform_work_groups_supported,
                          uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -40,15 +39,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
     built_options.emplace("-Dconv_2d=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (*is_non_uniform_work_groups_supported) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation) {
@@ -71,6 +68,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     }
 
     *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
@@ -79,7 +79,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
 
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
       kernel->setArg(idx++, gws[2]);
@@ -105,9 +105,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
     kernel->setArg(idx++, dilations[1]);
 
     *prev_input_shape = input->shape();
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc
index 83cff27354a563d63094d1541a4735e51b136396..8fc0924704badcf1f37d9a55b8c0188e65b295de 100644
--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -48,8 +48,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::stringstream kernel_name_ss;
@@ -58,11 +56,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ =
         runtime->BuildKernel("depth_to_space", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   uint32_t gws[3];
@@ -83,7 +84,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     }
 
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -94,9 +95,6 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index 11bb38b3a603055b738e4ed2a6ea609a12e1d24b..5e0b99ba1e5a00e25eec931f209c0a28c8ed85b2 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -24,7 +24,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                      std::vector<index_t> *prev_input_shape,
                      Tensor *output,
                      StatsFuture *future,
-                     bool *is_non_uniform_work_groups_supported,
                      uint32_t *kwg_size) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -45,8 +44,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel->get() == nullptr) {
-    *is_non_uniform_work_groups_supported =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
     if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -55,8 +52,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     } else {
       built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
     }
-    if (*is_non_uniform_work_groups_supported) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
@@ -83,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
 
     *kernel =
         runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
   if (!IsVecEqual(*prev_input_shape, input->shape())) {
     const index_t input_batch = input->dim(0);
@@ -97,7 +97,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                input_channels);
 
     uint32_t idx = 0;
-    if (!(*is_non_uniform_work_groups_supported)) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel->setArg(idx++, gws[0]);
       kernel->setArg(idx++, gws[1]);
       kernel->setArg(idx++, gws[2]);
@@ -124,9 +124,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
     }
 
     *prev_input_shape = input->shape();
-
-    *kwg_size =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
   }
 
   const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
@@ -185,7 +182,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
   DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                   dilations_, activation_, relux_max_limit_,
                   DataTypeToEnum<T>::value, &input_shape_, output, future,
-                  &is_non_uniform_work_groups_supported_, &kwg_size_);
+                  &kwg_size_);
 }
 
 template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc
index 38a231da40224da36eb0a9e972764b0a1915ead9..c23534bbce5ca423314d23bd470a5cbc2289ae1e 100644
--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -30,8 +30,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -39,15 +37,18 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
     built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
     kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input0->shape())) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
     }
@@ -60,9 +61,6 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input0->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index b8b8d6a3cc51de74c9cab7a8f85f86f61d218abd..2ab8cde257de73b8dee80300afa89122dd3cf126 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -200,8 +200,6 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
-  const bool is_non_uniform_work_groups_supported =
-      runtime->IsNonUniformWorkgroupsSupported();
 
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
@@ -239,7 +237,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         << "Tuning parameters of 3D kernel must be 4D";
     cl_int error = CL_SUCCESS;
     std::vector<uint32_t> roundup_gws(3);
-    if (!is_non_uniform_work_groups_supported) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       for (size_t i = 0; i < 3; ++i) {
         roundup_gws[i] = RoundUp(gws[i], params[i]);
       }
@@ -252,7 +250,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
       for (uint32_t i = 0; i < num_blocks; ++i) {
         uint32_t gws2 =
             (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        if (is_non_uniform_work_groups_supported) {
+        if (runtime->IsNonUniformWorkgroupsSupported()) {
           error = runtime->command_queue().enqueueNDRangeKernel(
               kernel, cl::NDRange(0, 0, i * block_size),
               cl::NDRange(gws[0], gws[1], gws2),
@@ -268,7 +266,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
       }
     } else {
       timer->ClearTiming();
-      if (is_non_uniform_work_groups_supported) {
+      if (runtime->IsNonUniformWorkgroupsSupported()) {
         error = runtime->command_queue().enqueueNDRangeKernel(
             kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
             cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
@@ -293,7 +291,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
         for (uint32_t i = 0; i < num_blocks; ++i) {
           uint32_t gws2 =
               (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          if (is_non_uniform_work_groups_supported) {
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
             error = runtime->command_queue().enqueueNDRangeKernel(
                 kernel, cl::NDRange(0, 0, i * block_size),
                 cl::NDRange(gws[0], gws[1], gws2),
@@ -332,8 +330,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                          const std::vector<uint32_t> &lws,
                          StatsFuture *future) {
   auto runtime = OpenCLRuntime::Global();
-  const bool is_non_uniform_work_groups_supported =
-      runtime->IsNonUniformWorkgroupsSupported();
 
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     const uint32_t kwg_size =
@@ -359,7 +355,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         << "Tuning parameters of 2D kernel must be 3d";
     cl_int error = CL_SUCCESS;
     std::vector<uint32_t> roundup_gws(2);
-    if (!is_non_uniform_work_groups_supported) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       for (size_t i = 0; i < 2; ++i) {
         roundup_gws[i] = RoundUp(gws[i], params[i]);
       }
@@ -372,7 +368,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
       for (uint32_t i = 0; i < num_blocks; ++i) {
         uint32_t gws1 =
             (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        if (is_non_uniform_work_groups_supported) {
+        if (runtime->IsNonUniformWorkgroupsSupported()) {
           error = runtime->command_queue().enqueueNDRangeKernel(
               kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
               cl::NDRange(params[0], params[1]), nullptr, &event);
@@ -387,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
       }
     } else {
       timer->ClearTiming();
-      if (is_non_uniform_work_groups_supported) {
+      if (runtime->IsNonUniformWorkgroupsSupported()) {
         error = runtime->command_queue().enqueueNDRangeKernel(
             kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
             cl::NDRange(params[0], params[1]), nullptr, &event);
@@ -411,7 +407,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
         for (uint32_t i = 0; i < num_blocks; ++i) {
           uint32_t gws1 =
               (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          if (is_non_uniform_work_groups_supported) {
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
             error = runtime->command_queue().enqueueNDRangeKernel(
                 kernel, cl::NDRange(0, i * block_size),
                 cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc
index 79dcc40d67b0c2a855fb68ab632409e0d868aa6f..7a4822096309297d8c11e80c1f7eb6ea5069b48f 100644
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -34,21 +34,22 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     auto dt = DataTypeToEnum<T>::value;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
     built_options.emplace("-Dmatmul=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   uint32_t idx = 0;
-  if (!is_non_uniform_work_groups_supported_) {
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
     kernel_.setArg(idx++, gws[0]);
     kernel_.setArg(idx++, gws[1]);
   }
@@ -61,8 +62,6 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
   kernel_.setArg(idx++, static_cast<int>(height_blocks));
   kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
 
-  kwg_size_ =
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
   std::stringstream ss;
   ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 9b2f96c8f1e92d7c64897bd3d5c3c7f83bcd63ea..774fd5ee2ac60875491deb00f1ce4fcbef8ba97e 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -21,8 +21,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     const DataType dt = DataTypeToEnum<T>::value;
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
@@ -39,10 +37,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     if (pooling_type_ == AVG) {
       built_options.emplace("-DPOOL_AVG");
     }
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   std::vector<uint32_t> gws;
@@ -81,7 +82,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     };
 
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -97,9 +98,6 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   } else {
     index_t batch = output->dim(0);
     index_t out_height = output->dim(1);
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index ce2fe7bfbc28c9d4563e6e900d87ec753179af84..ac2733e9e511c8a32dee3371c9ed404be964cb90 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -31,19 +31,20 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
     built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ =
         runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, input->shape())) {
     MACE_CHECK(out_height > 0 && out_width > 0);
@@ -60,7 +61,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
         CalculateResizeScale(in_width, out_width, align_corners_);
 
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -74,9 +75,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(out_height));
 
     input_shape_ = input->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index d610e1e14b6bfd49d61ed93fc95c150afd1a665e..850f08f6df5b091750d0b6ac203bf3e72e00099c 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -32,18 +32,19 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
     built_options.emplace("-Dslice=" + kernel_name);
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE="
                            + DtToCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   const index_t channel_blk = RoundUpDiv4(output_channels);
 
@@ -53,8 +54,6 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
       static_cast<uint32_t>(input->dim(0) * input->dim(1)),
   };
 
-  kwg_size_ =
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
   std::stringstream ss;
   ss << "slice_opencl_kernel_"
@@ -65,7 +64,7 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      << outputs_count;
   for (int i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index 61ea022890a30875a845b6034e37ef8b80081e9b..ea4f0b3e13d2d023a9dc98691ce47cb269f97714 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -30,22 +30,23 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::set<std::string> built_options;
     std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
     built_options.emplace("-Dsoftmax=" + kernel_name);
     auto dt = DataTypeToEnum<T>::value;
     built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
     built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(input_shape_, logits->shape())) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -56,9 +57,6 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
     kernel_.setArg(idx++, *(output->opencl_image()));
 
     input_shape_ = logits->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 38da0548b974269a114022d71b6e792493e2c71e..31b5013b737335c40255d9d4163e1d2fb8572d68 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -39,8 +39,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
     std::set<std::string> built_options;
     std::stringstream kernel_name_ss;
@@ -49,15 +47,18 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ =
         runtime->BuildKernel("space_to_batch", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   if (!IsVecEqual(space_shape_, space_tensor->shape())) {
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
       kernel_.setArg(idx++, gws[2]);
@@ -79,9 +80,6 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<int32_t>(batch_tensor->dim(2)));
 
     space_shape_ = space_tensor->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 092a60cd805acd2b28cb909395846ed8ec3bd1e2..f4fd5525960706e31ae3a303d00c50e534bfeaec 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -18,8 +18,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
     std::set<std::string> built_options;
@@ -28,11 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
   std::vector<index_t> output_shape(4);
   std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
@@ -61,7 +62,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     output_tensor->ResizeImage(output_shape, image_shape);
 
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
     }
@@ -76,9 +77,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, static_cast<uint32_t>(paddings[1] / 2));
 
     input_shape_ = input_tensor->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
@@ -99,8 +97,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
   auto runtime = OpenCLRuntime::Global();
 
   if (kernel_.get() == nullptr) {
-    is_non_uniform_work_groups_supported_ =
-        runtime->IsNonUniformWorkgroupsSupported();
     std::string obfuscated_kernel_name =
         MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
     std::set<std::string> built_options;
@@ -110,8 +106,8 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                           DtToUpstreamCLDt(DataTypeToEnum<T>::value));
     built_options.emplace("-DCMD_DATA_TYPE=" +
                           DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_non_uniform_work_groups_supported_) {
-      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
     }
     built_options.emplace(bias != nullptr ? "-DBIAS" : "");
     switch (activation_) {
@@ -138,6 +134,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
 
     kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                    built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const uint32_t gws[2] = {
@@ -153,7 +152,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     const uint32_t round_h = (height_ + 1) / 2;
     const uint32_t round_w = (width_ + 1) / 2;
     uint32_t idx = 0;
-    if (!is_non_uniform_work_groups_supported_) {
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
       kernel_.setArg(idx++, gws[0]);
       kernel_.setArg(idx++, gws[1]);
     }
@@ -173,9 +172,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
     kernel_.setArg(idx++, relux_max_limit_);
 
     input_shape_ = input_tensor->shape();
-
-    kwg_size_ =
-        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
   }
 
   const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 52dd12342ec360c07de992d413eac509b8f5778b..ac5c7987fbeda1e76c7e5a13f4a5166c3a2f6c07 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -186,7 +186,6 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h
index 09ae3ba5075bc959e7b571db40d06dc548b0bdd4..4e0c5ae3cb04def05794faab46cfc5dc90727e3c 100644
--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -174,7 +174,6 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index ce7431da3da8d0f2b39d6c5c38b694867c866365..1bde41e539b0aa9bd8b458261913a9957da2ec0a 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -62,7 +62,6 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
                   StatsFuture *future);
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
 };
 
 }  // namespace kernels
diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h
index b491e2ad39249f1e66233375aaa3c904951f2b84..e2b8efc8d8673f26b01a2124dcc3fb64730ad0d5 100644
--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -62,7 +62,6 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h
index 6bd66cbb3e721beb254b06486b12ebb52ab184cd..4e0d4c591e9b6cfe8544c1bfc4d98936c921dee1 100644
--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -57,7 +57,6 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> space_shape_;
 };
 
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index df12ab36227eab19372c53e02f0f4110c937bd00..e3169541dd197764d284d35c7eabbb61a2ba38b6 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -52,7 +52,6 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };
 
@@ -111,7 +110,6 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
 
   cl::Kernel kernel_;
   uint32_t kwg_size_;
-  bool is_non_uniform_work_groups_supported_;
   std::vector<index_t> input_shape_;
 };