Format code and reduce calls that query the kernel work-group (kwg) size

ed267833 · yejianwu · 44d4903d · ed267833 · ed267833 · ed267833
43 changed files
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
    if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
      *device_ = device;
      gpu_detected = true;
      const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
-      constexpr const char *kMaliGPUStr = "Mali";
-      if (device_name == kQualcommAdrenoGPUStr) {
-        gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
-      } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
-        gpu_type_ = GPU_TYPE::MALI;
-      } else {
-        gpu_type_ = GPU_TYPE::UNKNOWN;
-      }
      const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
      opencl_version_ = device_version.substr(7, 3);
@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
  }
  cl_int err;
-  if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
    std::vector<cl_context_properties> context_properties;
    context_properties.reserve(5);
    GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
  return size;
 }
-const GPU_TYPE OpenCLRuntime::GetGPUType() const {
+const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
-  return gpu_type_;
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
 }
-const std::string &OpenCLRuntime::GetOpenclVersion() const {
+const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
-  return opencl_version_;
+    const std::string &device_name) {
+  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+  constexpr const char *kMaliGPUStr = "Mali";
+  constexpr const char *kPowerVRGPUStr = "PowerVR";
+  if (device_name == kQualcommAdrenoGPUStr) {
+    return GPUType::QUALCOMM_ADRENO;
+  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+    return GPUType::MALI;
+  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
+    return GPUType::PowerVR;
+  } else {
+    return GPUType::UNKNOWN;
+  }
 }
 }  // namespace mace
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,9 +18,10 @@
 namespace mace {
-enum GPU_TYPE {
+enum GPUType {
  QUALCOMM_ADRENO,
  MALI,
+  PowerVR,
  UNKNOWN,
 };
@@ -55,8 +56,8 @@ class OpenCLRuntime {
  uint64_t GetDeviceMaxWorkGroupSize();
  uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
  uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
-  const GPU_TYPE GetGPUType() const;
+  const bool IsNonUniformWorkgroupsSupported();
-  const std::string &GetOpenclVersion() const;
+  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
  cl::Kernel BuildKernel(const std::string &program_name,
                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);
@@ -82,7 +83,7 @@ class OpenCLRuntime {
  std::map<std::string, cl::Program> built_program_map_;
  std::mutex program_build_mutex_;
  std::string kernel_path_;
-  GPU_TYPE gpu_type_;
+  GPUType gpu_type_;
  std::string opencl_version_;
  static GPUPerfHint gpu_perf_hint_;

--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -155,6 +155,8 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
  ActivationType activation_;
  T relux_max_limit_;
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::string tuning_key_prefix_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -90,6 +90,8 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -157,6 +157,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                  Tensor *output,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -64,6 +64,8 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                  Tensor *output,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -56,6 +56,8 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  const int groups_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -85,6 +85,8 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                  Tensor *output,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -401,6 +401,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  const int block_size_;
  bool d2s_;
  std::vector<index_t> input_shape_;

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -97,6 +97,8 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -241,6 +241,8 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
 };
 }  // namespace kernels

--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -26,16 +26,16 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    switch (activation_) {
@@ -83,11 +83,12 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -26,8 +26,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  for (int i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
@@ -37,6 +35,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  }
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
@@ -47,7 +47,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
@@ -78,11 +78,12 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[1]);
    input_shape_ = input_tensors[0]->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {kwg_size / 16, 16, 1};
+  }
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
  std::stringstream ss;
  ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
     << "_" << output_shape[2] << "_" << output_shape[3];

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -36,16 +36,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    if (folded_constant_) {
@@ -89,11 +90,12 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3), folded_constant_);

--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -29,16 +29,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add");
    built_options.emplace("-Dbias_add=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
@@ -52,15 +52,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, gws[1]);
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
  cl::Event event;
  cl_int error;
-  if (is_qualcomm_opencl200) {
+  if (is_non_uniform_work_groups_supported_) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -62,14 +62,15 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+  const bool is_non_uniform_work_groups_supported =
+      runtime->IsNonUniformWorkgroupsSupported();
  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
  std::set<std::string> built_options;
  std::stringstream kernel_name_ss;
  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
  built_options.emplace(kernel_name_ss.str());
-  if (is_qualcomm_opencl200) {
+  if (is_non_uniform_work_groups_supported) {
    built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
  }
  if (buffer->dtype() == image->dtype()) {
@@ -115,7 +116,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
  cl::Event event;
  cl_int error;
-  if (is_qualcomm_opencl200) {
+  if (is_non_uniform_work_groups_supported) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);

--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -36,16 +36,16 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
@@ -63,11 +63,12 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "channel_shuffle_opencl_kernel_"
     << output->dim(0) << "_"

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel,
                    const DataType dt,
                    std::vector<index_t> *prev_input_shape,
                    Tensor *output,
-                    StatsFuture *future) {
+                    StatsFuture *future,
+                    bool *is_non_uniform_work_groups_supported,
+                    uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel->get() == nullptr) {
+    *is_non_uniform_work_groups_supported =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
    built_options.emplace("-Dconcat_channel=" + kernel_name);
-    if (is_qualcomm_opencl200) {
+    if (*is_non_uniform_work_groups_supported) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    if (input0->dtype() == output->dtype()) {
@@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel,
    kernel->setArg(idx++, gws[2]);
    *prev_input_shape = input0->shape();
-  }
-  const uint32_t kwg_size =
+    *kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::stringstream ss;
  ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);
@@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel,
                    const std::vector<const Tensor *> &input_list,
                    const DataType dt,
                    Tensor *output,
-                    StatsFuture *future) {
+                    StatsFuture *future,
+                    bool *is_non_uniform_work_groups_supported,
+                    uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel->get() == nullptr) {
+    *is_non_uniform_work_groups_supported =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
    built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (*is_non_uniform_work_groups_supported) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
@@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel,
    kernel->setArg(idx++, gws[2]);
    chan_blk_offset += input_channel_blk;
-    const uint32_t kwg_size =
+    *kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-    const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+    const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
    std::stringstream ss;
    ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
       << batch * height;
@@ -169,11 +174,13 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
  switch (inputs_count) {
    case 2:
      Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              &input_shape_, output, future);
+              &input_shape_, output, future,
+              &is_non_uniform_work_groups_supported_, &kwg_size_);
      break;
    default:
      if (divisible_four) {
-        ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
+        ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
+            &is_non_uniform_work_groups_supported_, &kwg_size_);
      } else {
        MACE_NOT_IMPLEMENTED;
      }

--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future);
+                             StatsFuture *future,
+                             bool *is_non_uniform_work_groups_supported,
+                             uint32_t *kwg_size);
 extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const Tensor *input,
@@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future);
+                             StatsFuture *future,
+                             bool *is_non_uniform_work_groups_supported,
+                             uint32_t *kwg_size);
 extern void Conv2dOpencl(cl::Kernel *kernel,
                         const Tensor *input,
@@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         const DataType dt,
                         std::vector<index_t> *prev_input_shape,
                         Tensor *output,
-                         StatsFuture *future);
+                         StatsFuture *future,
+                         bool *is_non_uniform_work_groups_supported,
+                         uint32_t *kwg_size);
 template <typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -61,7 +67,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      const Tensor *bias, const int stride, const int *padding,
      const int *dilations, const ActivationType activation,
      const float relux_max_limit, const DataType dt,
-      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
+      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
+      bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5] = {
      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
@@ -101,11 +108,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    auto conv2d_func = selector[kernel_h - 1];
    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                dilations_, activation_, relux_max_limit_,
-                DataTypeToEnum<T>::value, &input_shape_, output, future);
+                DataTypeToEnum<T>::value, &input_shape_, output, future,
+                &is_non_uniform_work_groups_supported_, &kwg_size_);
  } else {
    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
-                 DataTypeToEnum<T>::value, &input_shape_, output, future);
+                 DataTypeToEnum<T>::value, &input_shape_, output, future,
+                 &is_non_uniform_work_groups_supported_, &kwg_size_);
  }
 }

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future) {
+                             StatsFuture *future,
+                             bool *is_non_uniform_work_groups_supported,
+                             uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -38,9 +40,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel->get() == nullptr) {
+    *is_non_uniform_work_groups_supported =
+        runtime->IsNonUniformWorkgroupsSupported();
    MACE_CHECK(input_batch == batch);
    std::set<std::string> built_options;
@@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (*is_non_uniform_work_groups_supported) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    if (bias != nullptr) {
@@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    kernel->setArg(idx++, gws[2]);
    *prev_input_shape = input->shape();
-  }
-  const uint32_t kwg_size =
+    *kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::string tuning_key =
      Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future) {
+                             StatsFuture *future,
+                             bool *is_non_uniform_work_groups_supported,
+                             uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel->get() == nullptr) {
+    *is_non_uniform_work_groups_supported =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
    built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (*is_non_uniform_work_groups_supported) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
@@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    kernel->setArg(idx++, gws[2]);
    *prev_input_shape = input->shape();
-  }
-  const uint32_t kwg_size =
+    *kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  const std::vector<uint32_t> lws = {4, kwg_size / 32, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
  std::string tuning_key =
      Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         const DataType dt,
                         std::vector<index_t> *prev_input_shape,
                         Tensor *output,
-                         StatsFuture *future) {
+                         StatsFuture *future,
+                         bool *is_non_uniform_work_groups_supported,
+                         uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel->get() == nullptr) {
+    *is_non_uniform_work_groups_supported =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
    built_options.emplace("-Dconv_2d=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (*is_non_uniform_work_groups_supported) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
@@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    kernel->setArg(idx++, gws[2]);
    *prev_input_shape = input->shape();
-  }
-  const uint32_t kwg_size =
+    *kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::string tuning_key =
      Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));

--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
@@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ =
@@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
 }

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                     const DataType dt,
                     std::vector<index_t> *prev_input_shape,
                     Tensor *output,
-                     StatsFuture *future) {
+                     StatsFuture *future,
+                     bool *is_non_uniform_work_groups_supported,
+                     uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel->get() == nullptr) {
+    *is_non_uniform_work_groups_supported =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
    if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    } else {
      built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
    }
-    if (is_qualcomm_opencl200) {
+    if (*is_non_uniform_work_groups_supported) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
@@ -118,12 +120,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    kernel->setArg(idx++, gws[0]);
    kernel->setArg(idx++, gws[1]);
    kernel->setArg(idx++, gws[2]);
    *prev_input_shape = input->shape();
-  }
-  const uint32_t kwg_size =
+    *kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
                                  batch, height, width, channels, multiplier);
  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
@@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
-                  DataTypeToEnum<T>::value, &input_shape_, output, future);
+                  DataTypeToEnum<T>::value, &input_shape_, output, future,
+                  &is_non_uniform_work_groups_supported_, &kwg_size_);
 }
 template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -29,9 +29,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -39,7 +39,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
@@ -56,12 +56,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, gws[0]);
    kernel_.setArg(idx++, gws[1]);
    input_shape_ = input0->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {kwg_size / 16, 16, 1};
+  }
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
  std::stringstream ss;
  ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
  }
 }
-const bool IsQualcommOpenCL200() {
-  auto runtime = OpenCLRuntime::Global();
-  if (runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO &&
-      runtime->GetOpenclVersion() == "2.0") {
-    return true;
-  } else {
-    return false;
-  }
-}
 void TuningOrRun3DKernel(const cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+  const bool is_non_uniform_work_groups_supported =
+      runtime->IsNonUniformWorkgroupsSupported();
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
@@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
        << "Tuning parameters of 3D kernel must be 4D";
    cl_int error = CL_SUCCESS;
    std::vector<uint32_t> roundup_gws(3);
-    if (!is_qualcomm_opencl200) {
+    if (!is_non_uniform_work_groups_supported) {
      for (size_t i = 0; i < 3; ++i) {
        roundup_gws[i] = RoundUp(gws[i], params[i]);
      }
@@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws2 =
            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        if (is_qualcomm_opencl200) {
+        if (is_non_uniform_work_groups_supported) {
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, 0, i * block_size),
              cl::NDRange(gws[0], gws[1], gws2),
@@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      }
    } else {
      timer->ClearTiming();
-      if (is_qualcomm_opencl200) {
+      if (is_non_uniform_work_groups_supported) {
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
@@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws2 =
              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-          if (is_qualcomm_opencl200) {
+          if (is_non_uniform_work_groups_supported) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NDRange(0, 0, i * block_size),
                cl::NDRange(gws[0], gws[1], gws2),
@@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
+  const bool is_non_uniform_work_groups_supported =
+      runtime->IsNonUniformWorkgroupsSupported();
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
@@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
        << "Tuning parameters of 2D kernel must be 3d";
    cl_int error = CL_SUCCESS;
    std::vector<uint32_t> roundup_gws(2);
-    if (!is_qualcomm_opencl200) {
+    if (!is_non_uniform_work_groups_supported) {
      for (size_t i = 0; i < 2; ++i) {
        roundup_gws[i] = RoundUp(gws[i], params[i]);
      }
@@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws1 =
            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        if (is_qualcomm_opencl200) {
+        if (is_non_uniform_work_groups_supported) {
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
              cl::NDRange(params[0], params[1]), nullptr, &event);
@@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
      }
    } else {
      timer->ClearTiming();
-      if (is_qualcomm_opencl200) {
+      if (is_non_uniform_work_groups_supported) {
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
            cl::NDRange(params[0], params[1]), nullptr, &event);
@@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws1 =
              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-          if (is_qualcomm_opencl200) {
+          if (is_non_uniform_work_groups_supported) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NDRange(0, i * block_size),
                cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),

--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -102,8 +102,6 @@ std::string Concat(Args... args) {
  return ss.str();
 }
-const bool IsQualcommOpenCL200();
 }  // namespace kernels
 }  // namespace mace
 #endif  // MACE_KERNELS_OPENCL_HELPER_H_
--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -33,16 +33,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
    built_options.emplace("-Dmatmul=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
@@ -59,9 +59,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  kernel_.setArg(idx++, gws[0]);
  kernel_.setArg(idx++, gws[1]);
-  const uint32_t kwg_size =
+  kwg_size_ =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {kwg_size / 64, 64, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
  std::stringstream ss;
  ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
     << C->dim(2) << "_" << C->dim(3);

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -20,9 +20,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    const DataType dt = DataTypeToEnum<T>::value;
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
@@ -39,13 +39,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (pooling_type_ == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
  }
-  uint32_t gws[3];
+  std::vector<uint32_t> gws;
  if (!IsVecEqual(input_shape_, input->shape())) {
    std::vector<index_t> output_shape(4);
    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
@@ -75,9 +75,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    index_t channel_blocks = (channels + 3) / 4;
-    gws[0] = static_cast<uint32_t>(channel_blocks);
+    gws = {
-    gws[1] = static_cast<uint32_t>(out_width);
+        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
-    gws[2] = static_cast<uint32_t>(batch * out_height);
+        static_cast<uint32_t>(batch * out_height),
+    };
    uint32_t idx = 0;
    kernel_.setArg(idx++, *(input->opencl_image()));
@@ -94,26 +95,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  } else {
-    index_t batch = output->dim(0);
-    index_t out_height = output->dim(1);
-    index_t out_width = output->dim(2);
-    index_t channels = output->dim(3);
-    index_t channel_blocks = (channels + 3) / 4;
-    gws[0] = static_cast<uint32_t>(channel_blocks);
+    kwg_size_ =
-    gws[1] = static_cast<uint32_t>(out_width);
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-    gws[2] = static_cast<uint32_t>(batch * out_height);
  }
-  const uint32_t kwg_size =
+  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
-      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
  std::stringstream ss;
  ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);
-  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
 }
 template struct PoolingFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -30,16 +30,16 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ =
@@ -72,11 +72,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = input->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
     << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -31,16 +31,16 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
    built_options.emplace("-Dslice=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE="
                           + DtToCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
@@ -53,9 +53,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
-  const uint32_t kwg_size =
+  kwg_size_ =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "slice_opencl_kernel_"
     << input->dim(0) << "_"

--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -29,16 +29,16 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
@@ -52,12 +52,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
    kernel_.setArg(idx++, gws[0]);
    kernel_.setArg(idx++, gws[1]);
    kernel_.setArg(idx++, gws[2]);
    input_shape_ = logits->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -38,9 +38,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::set<std::string> built_options;
    std::stringstream kernel_name_ss;
@@ -49,7 +49,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ =
@@ -77,11 +77,12 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[2]);
    space_shape_ = space_tensor->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {8, kwg_size / 64, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
     << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -17,9 +17,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
    std::set<std::string> built_options;
@@ -28,7 +28,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
@@ -74,11 +74,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[1]);
    input_shape_ = input_tensor->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {kwg_size / 8, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
  std::stringstream ss;
  ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
     << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
@@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
  auto runtime = OpenCLRuntime::Global();
-  const bool is_qualcomm_opencl200 = IsQualcommOpenCL200();
  if (kernel_.get() == nullptr) {
+    is_non_uniform_work_groups_supported_ =
+        runtime->IsNonUniformWorkgroupsSupported();
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
    std::set<std::string> built_options;
@@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    if (is_qualcomm_opencl200) {
+    if (is_non_uniform_work_groups_supported_) {
      built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0");
    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
@@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, gws[1]);
    input_shape_ = input_tensor->shape();
-  }
-  const uint32_t kwg_size =
+    kwg_size_ =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
-  const std::vector<uint32_t> lws = {kwg_size / 8, 8, 1};
+  }
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
  std::stringstream ss;
  ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -185,6 +185,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -173,6 +173,8 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -61,6 +61,8 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
                  const std::vector<Tensor *> &output_list,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
 };
 }  // namespace kernels

--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -61,6 +61,8 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -56,6 +56,8 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> space_shape_;
 };

--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -51,6 +51,8 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };
@@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
+  bool is_non_uniform_work_groups_supported_;
  std::vector<index_t> input_shape_;
 };

--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -43,6 +43,10 @@ else
    HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
  fi
+  if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
+    NEON_ENABLE_FLAG="--define neon=true"
+  fi
  bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -54,6 +58,7 @@ else
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --define openmp=true \
    --copt="-O3" \
+    $NEON_ENABLE_FLAG \
    $PRODUCTION_MODE_BUILD_FLAGS \
    $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi