diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index d5ba7010c8b58c809e98ea7b1ba9afb840a948e5..dcd7fab4a019c5a49772b7794c1aa17e0bbd4e26 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
     if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
       *device_ = device;
       gpu_detected = true;
+      const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
-      constexpr const char *kMaliGPUStr = "Mali";
-      if (device_name == kQualcommAdrenoGPUStr) {
-        gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
-      } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
-        gpu_type_ = GPU_TYPE::MALI;
-      } else {
-        gpu_type_ = GPU_TYPE::UNKNOWN;
-      }
+      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
 
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = device_version.substr(7, 3);
@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   cl_int err;
-  if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
-const GPU_TYPE OpenCLRuntime::GetGPUType() const {
-  return gpu_type_;
+const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
 }
 
-const std::string &OpenCLRuntime::GetOpenclVersion() const {
-  return opencl_version_;
+const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
+    const std::string &device_name) {
+  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+  constexpr const char *kMaliGPUStr = "Mali";
+  constexpr const char *kPowerVRGPUStr = "PowerVR";
+
+  if (device_name == kQualcommAdrenoGPUStr) {
+    return GPUType::QUALCOMM_ADRENO;
+  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+    return GPUType::MALI;
+  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
+    return GPUType::PowerVR;
+  } else {
+    return GPUType::UNKNOWN;
+  }
 }
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index d3cc5cc7037ddea9a717ab537e8889aa6ce50bd3..8a3ce06abb63f078efa89ca43b242e46a13e5a3e 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,9 +18,10 @@
 
 namespace mace {
 
-enum GPU_TYPE {
+enum GPUType {
   QUALCOMM_ADRENO,
   MALI,
+  PowerVR,
   UNKNOWN,
 };
 
@@ -55,8 +56,8 @@ class OpenCLRuntime {
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
-  const GPU_TYPE GetGPUType() const;
-  const std::string &GetOpenclVersion() const;
+  const bool IsNonUniformWorkgroupsSupported();
+  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
@@ -82,7 +83,7 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string kernel_path_;
-  GPU_TYPE gpu_type_;
+  GPUType gpu_type_;
   std::string opencl_version_;
 
   static GPUPerfHint gpu_perf_hint_;
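Reviewer note on the runtime change above: IsNonUniformWorkgroupsSupported() keys off opencl_version_, which the constructor slices out of CL_DEVICE_VERSION with substr(7, 3). That works because the OpenCL spec fixes the string format as "OpenCL <major>.<minor> <vendor-specific info>", so the numeric version always occupies characters [7, 10). A minimal standalone sketch of the combined parse/check logic follows; the enum and function names mirror the patch, but the sample device strings are illustrative only, not taken from it.

#include <iostream>
#include <string>

enum GPUType { QUALCOMM_ADRENO, MALI, PowerVR, UNKNOWN };

// CL_DEVICE_VERSION is "OpenCL <major>.<minor> <vendor info>", so the
// numeric version sits at characters [7, 10), e.g. "2.0".
std::string ParseOpenCLVersion(const std::string &device_version) {
  return device_version.substr(7, 3);
}

GPUType ParseGPUTypeFromDeviceName(const std::string &device_name) {
  if (device_name == "QUALCOMM Adreno(TM)") return QUALCOMM_ADRENO;
  if (device_name.find("Mali") != std::string::npos) return MALI;
  if (device_name.find("PowerVR") != std::string::npos) return PowerVR;
  return UNKNOWN;
}

// Non-uniform work-group sizes are only trusted on Adreno with OpenCL 2.0.
bool IsNonUniformWorkgroupsSupported(GPUType type, const std::string &ver) {
  return type == QUALCOMM_ADRENO && ver == "2.0";
}

int main() {
  const std::string version = "OpenCL 2.0 QUALCOMM build";  // sample string
  const GPUType type = ParseGPUTypeFromDeviceName("QUALCOMM Adreno(TM)");
  std::cout << std::boolalpha
            << IsNonUniformWorkgroupsSupported(type,
                                               ParseOpenCLVersion(version))
            << std::endl;  // true
}

diff --git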
a/mace/kernels/activation.h b/mace/kernels/activation.h index 55368c3ca83c8aa7dd9e8d76efb47bde568ec4ce..5bb2fe4ac384add95b0177ba3ae1d192742481ce 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -155,6 +155,8 @@ class ActivationFunctor { ActivationType activation_; T relux_max_limit_; cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::string tuning_key_prefix_; std::vector input_shape_; }; diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 70d9583ba798babd3a27737c9ed7487913441bf6..e2d875e9a73e9d12668e6f11388060a35454e8ec 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -90,6 +90,8 @@ struct AddNFunctor { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 28b8d776c967e48a4af835ee55913c437aa3d3ea..f17db80a48295d1bf7a24e5775fae4a17f9a81f0 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -157,6 +157,8 @@ struct BatchNormFunctor : BatchNormFunctorBase { Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index d5372850bcf604b0f1e01e630c0c30b59e95abc0..f2f917f4f5073c3e585e70260678c95f9f13f59c 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -64,6 +64,8 @@ struct BiasAddFunctor { Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index f1e258337a2d9a871bbb3ac4aec70faf1a18edf9..b93e657837a50f658aa9c3444b99e3a0d65cf761 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -56,6 +56,8 @@ struct ChannelShuffleFunctor { void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; const int groups_; std::vector input_shape_; }; diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index de34ed69fa5803f61e9f6785b9d4b7185be2cccc..7186bde6e452983b3bc5620e3b620086907e19ab 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -85,6 +85,8 @@ struct ConcatFunctor : ConcatFunctorBase { Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 47516291d14ec21ba2202e2089bee03d6387c433..f2d3dfbb53c40ca5ff5e7753333c88300ac8b535 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -401,6 +401,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 3f6577f32159309bba931eaef58011902ecc2045..6b439db67ecb2c5c2f6ee2390e7900adfc90a307 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor { void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; const int block_size_; bool d2s_; 
std::vector input_shape_; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 166ea18a644ead1d53af2a7c3b83c73c617554d6..ba4f74c8a12132b1780467b38e35f52a8e127063 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 0f9e9b40061890a62e36104746bcaf0120bfab0f..11d52bc97e8802b04058589c6eb3bdb057607f00 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -97,6 +97,8 @@ struct EltwiseFunctor : EltwiseFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 62590400bf038773c9f16fae68f4c42de4ee9130..1ce9b6fd07f4a377664b03b821cf1b170dadea19 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -241,6 +241,8 @@ struct MatMulFunctor { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index d7b89336d196ac701572a76ac23b3eedba4c46a8..d3e6c7f90748ed0061f4671b4d540cf8b7129563 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -26,16 +26,16 @@ void ActivationFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } switch (activation_) { @@ -83,11 +83,12 @@ void ActivationFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 37e6062a989f47baf8e613a9e1847c233d3061dc..d7c149a9720ea30b45c9c3745422cd19f4c7820a 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -26,8 +26,6 @@ void AddNFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - for (int i = 1; i < size; ++i) { MACE_CHECK_NOTNULL(input_tensors[i]); MACE_CHECK(batch == input_tensors[i]->dim(0)); @@ -37,6 +35,8 @@ void AddNFunctor::operator()( } if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); if (input_tensors.size() > 4) { 
MACE_NOT_IMPLEMENTED; } @@ -47,7 +47,7 @@ void AddNFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } @@ -78,11 +78,12 @@ void AddNFunctor::operator()( kernel_.setArg(idx++, gws[1]); input_shape_ = input_tensors[0]->shape(); - } - const uint32_t kwg_size = + kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 16, 16, 1}; + } + + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] << "_" << output_shape[2] << "_" << output_shape[3]; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 10b956de57e9715de1008940ead6a48d60a362f8..c3a1765ce83a08e446c6d27393420a18ea61d544 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -36,16 +36,17 @@ void BatchNormFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (folded_constant_) { @@ -89,11 +90,12 @@ void BatchNormFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index 2fb1252b1309b72b0218396e049e3ff68d89b874..e67ebe712c9c4604566a0f4c3baddd273a205ee1 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -29,16 +29,16 @@ void BiasAddFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("bias_add", 
kernel_name, built_options); @@ -52,15 +52,16 @@ void BiasAddFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8}; + const std::vector lws = {8, kwg_size_ / 64, 8}; cl::Event event; cl_int error; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { error = runtime->command_queue().enqueueNDRangeKernel( kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 0cec970aa48989d4f263c999d1f7da3ad83c7201..7a5df69d9ec43953025ee2d1f208e5aac7332ce3 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -62,14 +62,15 @@ void BufferToImageFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + const bool is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (buffer->dtype() == image->dtype()) { @@ -115,7 +116,7 @@ void BufferToImageFunctor::operator()( cl::Event event; cl_int error; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 9d566477eccd4b0349b2a27d3233a1d39518f030..316ae62a2217b03cf09b1b6dd92e0142fd89b3b0 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -36,16 +36,16 @@ void ChannelShuffleFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, @@ -63,11 +63,12 @@ void ChannelShuffleFunctor::operator()( kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "channel_shuffle_opencl_kernel_" << 
output->dim(0) << "_" diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index ce5a77182ff74522b168300aae9e1b3b2914f6d4..111b7a9c17b130f582b5d37ca209f77ced0dc9ba 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (input0->dtype() == output->dtype()) { @@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input0->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); @@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); @@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); chan_blk_offset += input_channel_blk; - const uint32_t kwg_size = + *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" << batch * height; @@ -169,11 +174,13 @@ void ConcatFunctor::operator()( switch (inputs_count) { case 2: Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum::value, - &input_shape_, output, future); + &input_shape_, 
output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); break; default: if (divisible_four) { - ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future); + ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 468d80f09c60bd9584225d2c263766cef6c790e5..b9fa2d4c86b259bf9f9691654a92746071cad545 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size); extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const Tensor *input, @@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size); extern void Conv2dOpencl(cl::Kernel *kernel, const Tensor *input, @@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size); template void Conv2dFunctor::operator()(const Tensor *input, @@ -61,7 +67,8 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *bias, const int stride, const int *padding, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, StatsFuture *future); + std::vector *input_shape, Tensor *output, StatsFuture *future, + bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5] = { Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; @@ -101,11 +108,13 @@ void Conv2dFunctor::operator()(const Tensor *input, auto conv2d_func = selector[kernel_h - 1]; conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } else { Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index ad2af5a73a1a3e682c1334bbaa92945c0d49df97..be2fd08b8c25e82d681ab67b6ca8eecb0fe431ae 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -38,9 +40,9 @@ extern void 
Conv2dOpenclK1x1(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); MACE_CHECK(input_batch == batch); std::set built_options; @@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (bias != nullptr) { @@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 6ac0fa569ebe4ab1d58ca8a9a87cd1cc56564f44..cec0927fa44b79ffc897470272bd1827cd0c1308 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {4, kwg_size / 32, 8, 1}; + const std::vector lws = {4, *kwg_size / 32, 8, 1}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 0fc944422fd1a22c4b37a3cce0123158b7bee1f3..a9151b480fa5a19b010d9220e62532fe588fc85d 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index 0bafecd8ccdc994754d454dbcb2807390d6c8836..4daeac61bf58589fd29676aaa9001ec37aab065d 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; @@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = @@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor::operator()( kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index c43799db2d96312a63898904f5266bc8528ea810..873a16a40e937443f341119421cb85af4d1f749a 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { @@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); @@ -118,12 +120,14 @@ void 
DepthwiseConv2d(cl::Kernel *kernel, kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); + *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); @@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor::operator()( DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } template struct DepthwiseConv2dFunctor; diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index e2a68396d18045e94c4697295f3f1f6c8e1ec691..13413130c485ecb1ffb68a42a079be27c543046d 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -29,9 +29,9 @@ void EltwiseFunctor::operator()(const Tensor *input0, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); @@ -39,7 +39,7 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); @@ -56,12 +56,14 @@ void EltwiseFunctor::operator()(const Tensor *input0, kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); + input_shape_ = input0->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 16, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index b386cfc6e057fe3d82e0fb306f1291a1947bf898..ba788a26977750ae69d37f90e6661e6612cdcf08 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { } } -const bool IsQualcommOpenCL200() { - auto runtime = OpenCLRuntime::Global(); - - if (runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO && - runtime->GetOpenclVersion() == "2.0") { - return true; - } else { - return false; - } -} - void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const bool 
is_qualcomm_opencl200 = IsQualcommOpenCL200(); + const bool is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = @@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, << "Tuning parameters of 3D kernel must be 4D"; cl_int error = CL_SUCCESS; std::vector roundup_gws(3); - if (!is_qualcomm_opencl200) { + if (!is_non_uniform_work_groups_supported) { for (size_t i = 0; i < 3; ++i) { roundup_gws[i] = RoundUp(gws[i], params[i]); } @@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(gws[0], gws[1], gws2), @@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, } } else { timer->ClearTiming(); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); @@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(gws[0], gws[1], gws2), @@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + const bool is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = @@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, << "Tuning parameters of 2D kernel must be 3d"; cl_int error = CL_SUCCESS; std::vector roundup_gws(2); - if (!is_qualcomm_opencl200) { + if (!is_non_uniform_work_groups_supported) { for (size_t i = 0; i < 2; ++i) { roundup_gws[i] = RoundUp(gws[i], params[i]); } @@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); @@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } } else { timer->ClearTiming(); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(params[0], params[1]), nullptr, &event); @@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? 
(gws[1] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5b4e028318c1487825f553dce28079d4bc2faccf..89712c9b96aa043f5019cde6eae23aa07109f6f7 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -102,8 +102,6 @@ std::string Concat(Args... args) { return ss.str(); } -const bool IsQualcommOpenCL200(); - } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 9e29306186f0714839a7c8f0763c5967bc11e21e..19769f3d3d3d37389f55de983ed58d1480ba6935 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -33,16 +33,16 @@ void MatMulFunctor::operator()(const Tensor *A, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); @@ -59,9 +59,9 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); - const uint32_t kwg_size = + kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 64, 64, 1}; + const std::vector lws = {kwg_size_ / 64, 64, 1}; std::stringstream ss; ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" << C->dim(2) << "_" << C->dim(3); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index f3d4714cd325b48714f5ddf25e1b24d85aecb39b..fa9e157716919773dbc3a6d1f99beb016508a19d 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -20,9 +20,9 @@ void PoolingFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); const DataType dt = DataTypeToEnum::value; std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); @@ -39,13 +39,13 @@ void PoolingFunctor::operator()(const Tensor *input, if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); } - uint32_t gws[3]; + std::vector gws; if (!IsVecEqual(input_shape_, input->shape())) { std::vector output_shape(4); std::vector filter_shape = {kernels_[0], kernels_[1], @@ -75,9 +75,10 @@ void PoolingFunctor::operator()(const Tensor *input, index_t channel_blocks = (channels + 3) / 4; - gws[0] = 
static_cast(channel_blocks); - gws[1] = static_cast(out_width); - gws[2] = static_cast(batch * out_height); + gws = { + static_cast(channel_blocks), static_cast(out_width), + static_cast(batch * out_height), + }; uint32_t idx = 0; kernel_.setArg(idx++, *(input->opencl_image())); @@ -94,26 +95,16 @@ void PoolingFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); - } else { - index_t batch = output->dim(0); - index_t out_height = output->dim(1); - index_t out_width = output->dim(2); - index_t channels = output->dim(3); - - index_t channel_blocks = (channels + 3) / 4; - gws[0] = static_cast(channel_blocks); - gws[1] = static_cast(out_width); - gws[2] = static_cast(batch * out_height); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - std::vector lws = {8, kwg_size / 64, 8, 1}; + std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); } template struct PoolingFunctor; diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 63c71ea7fc4eb410b68ebba3dc707b5c331809c0..5bcb53e37f4b354b950ec7eca44589b50d1a6dbd 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -30,16 +30,16 @@ void ResizeBilinearFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = @@ -72,11 +72,12 @@ void ResizeBilinearFunctor::operator()( kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 55773a521c34c47635032b2b3d2dd4b8da346189..94f541b2418afe4906710be2b5d7f89b9d61c06b 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -31,16 +31,16 @@ void SliceFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); built_options.emplace("-Dslice=" + 
kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); @@ -53,9 +53,9 @@ void SliceFunctor::operator()( static_cast(input->dim(0) * input->dim(1)), }; - const uint32_t kwg_size = + kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "slice_opencl_kernel_" << input->dim(0) << "_" diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 321d7c296f9e756ca671e45ab4a6d554d72f40d8..6b06cc8fb4c408a233b4799c97fb15326d695721 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -29,16 +29,16 @@ void SoftmaxFunctor::operator()(const Tensor *logits, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); @@ -52,12 +52,14 @@ void SoftmaxFunctor::operator()(const Tensor *logits, kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); + input_shape_ = logits->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 128164f967f3ddadd547efa3862cd79529868fee..6e00f6ea2ef726cb52b69ac580a4072f8a2d84a4 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -38,9 +38,9 @@ void SpaceToBatchFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; @@ -49,7 +49,7 @@ void SpaceToBatchFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = @@ -77,11 +77,12 @@ void 
SpaceToBatchFunctor::operator()( kernel_.setArg(idx++, gws[2]); space_shape_ = space_tensor->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index c4a20a0307e34e024556a0680051a6e36774772d..905b1346a18e33f42cabfe53fd4f436d10c602bd 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -17,9 +17,9 @@ void WinogradTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); std::set built_options; @@ -28,7 +28,7 @@ void WinogradTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, @@ -74,11 +74,12 @@ void WinogradTransformFunctor::operator()( kernel_.setArg(idx++, gws[1]); input_shape_ = input_tensor->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 8, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 1}; std::stringstream ss; ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" @@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); std::set built_options; @@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor::operator()( kernel_.setArg(idx++, gws[1]); input_shape_ = input_tensor->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 8, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 1}; std::stringstream ss; ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 15cc691e71927300bec48224a7666f1468eb74c1..52dd12342ec360c07de992d413eac509b8f5778b 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -185,6 +185,8 @@ struct PoolingFunctor : PoolingFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 65e5121211d4d836d6d17809a843e0778defaecb..09ae3ba5075bc959e7b571db40d06dc548b0bdd4 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -173,6 +173,8 @@ struct ResizeBilinearFunctor void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index 59d9d667b0a63da1e1d3ee471aecec9efd9be1e9..ce7431da3da8d0f2b39d6c5c38b694867c866365 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -61,6 +61,8 @@ struct SliceFunctor { const std::vector &output_list, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; }; } // namespace kernels diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index a1c4ea2f6e5b9200f17d54906316a83cbefaa49a..b491e2ad39249f1e66233375aaa3c904951f2b84 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -61,6 +61,8 @@ struct SoftmaxFunctor { void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 757f784820f90fee842fc385606db4755cb52293..6bd66cbb3e721beb254b06486b12ebb52ab184cd 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -56,6 +56,8 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector space_shape_; }; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 6f483dacb06f920c54b14930dba3fd05ff845e44..df12ab36227eab19372c53e02f0f4110c937bd00 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -51,6 +51,8 @@ struct WinogradTransformFunctor void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; @@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/tools/build_mace_run.sh b/tools/build_mace_run.sh index 
669918d28247a654a28d7792e24c218c6fd1660e..4606fde6ca4a2299200266873b831a7113134a27 100644
--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -43,6 +43,10 @@ else
     HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
   fi
 
+  if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
+    NEON_ENABLE_FLAG="--define neon=true"
+  fi
+
   bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
     --crosstool_top=//external:android/crosstool \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -54,6 +58,7 @@ else
     --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
     --define openmp=true \
     --copt="-O3" \
+    $NEON_ENABLE_FLAG \
     $PRODUCTION_MODE_BUILD_FLAGS \
     $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi
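Note on the recurring functor change in the kernels above: each OpenCL functor now snapshots runtime->IsNonUniformWorkgroupsSupported() and the kernel's max work-group size into the new is_non_uniform_work_groups_supported_ / kwg_size_ members inside the one-time kernel_.get() == nullptr branch, so repeat invocations skip both runtime queries. A condensed sketch of that caching pattern, with stand-in Runtime/Kernel types (the real ones are OpenCLRuntime and cl::Kernel; the stub return values are placeholders):

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-ins for cl::Kernel / OpenCLRuntime, just enough to show the shape.
struct Kernel {
  bool built = false;
  bool get() const { return built; }  // mimics the kernel_.get() null check
};

struct Runtime {
  bool IsNonUniformWorkgroupsSupported() const { return true; }       // stub
  uint32_t GetKernelMaxWorkGroupSize(const Kernel &) const { return 256; }
  Kernel BuildKernel() const { return Kernel{true}; }
};

struct Functor {
  Kernel kernel_;
  uint32_t kwg_size_ = 0;
  bool is_non_uniform_work_groups_supported_ = false;

  void operator()(Runtime *runtime) {
    if (!kernel_.get()) {
      // First call only: build the kernel and cache the runtime queries.
      is_non_uniform_work_groups_supported_ =
          runtime->IsNonUniformWorkgroupsSupported();
      kernel_ = runtime->BuildKernel();
      kwg_size_ = runtime->GetKernelMaxWorkGroupSize(kernel_);
    }
    // Later calls reuse the cached size to derive the local work-group shape.
    const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
    std::cout << "lws[1] = " << lws[1] << std::endl;  // 4
  }
};

int main() {
  Runtime runtime;
  Functor functor;
  functor(&runtime);  // builds and caches
  functor(&runtime);  // cache hit, no runtime queries
}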
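One more sketch, for the TuningOrRun2DKernel/TuningOrRun3DKernel branches in helper.cc: when non-uniform work-groups are unavailable, each global dimension is rounded up to a multiple of the chosen local size before enqueueing, and the gws values passed as kernel arguments let the kernel guard against the padded ids (the -DUSE_QUALCOMM_OPENCL_2_0 build option presumably selects the unguarded variant). The RoundUp body below is an assumed textbook implementation, not copied from the patch:

#include <cstdint>
#include <iostream>

// Smallest multiple of `multiple` that is >= `value`.
uint32_t RoundUp(uint32_t value, uint32_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  const uint32_t gws[3] = {17, 33, 9};  // exact work-item counts
  const uint32_t lws[3] = {8, 4, 8};    // chosen work-group shape
  bool non_uniform_supported = false;   // e.g. non-Adreno or OpenCL < 2.0

  uint32_t enqueue_gws[3];
  for (int i = 0; i < 3; ++i) {
    // With non-uniform support the exact size is enqueued; otherwise the
    // padded range is launched and the kernel returns early past gws[i].
    enqueue_gws[i] = non_uniform_supported ? gws[i] : RoundUp(gws[i], lws[i]);
  }
  std::cout << enqueue_gws[0] << " " << enqueue_gws[1] << " "
            << enqueue_gws[2] << std::endl;  // 24 36 16
}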