Merge branch 'compatible_with_opencl_1.1_and_1.2' into 'master'

Compatible with opencl 1.1 and 1.2 See merge request !333

Merge branch 'compatible_with_opencl_1.1_and_1.2' into 'master'
Compatible with opencl 1.1 and 1.2 See merge request !333
33b049e6 · 刘琦 · aeda046c · 0fd7ce42 · 33b049e6 · 33b049e6
65 changed file
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -142,17 +142,17 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
  }

  bool gpu_detected = false;
-  bool is_adreno_gpu = false;
  device_ = std::make_shared<cl::Device>();
  for (auto device : all_devices) {
    if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
      *device_ = device;
      gpu_detected = true;
+
      const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
-      if (device_name == kQualcommAdrenoGPUStr) {
-        is_adreno_gpu = true;
-      }
+      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
+
+      const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
+      opencl_version_ = device_version.substr(7, 3);

      VLOG(1) << "Using device: " << device_name;
      break;
@@ -171,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
  }

  cl_int err;
-  if (is_adreno_gpu) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
    std::vector<cl_context_properties> context_properties;
    context_properties.reserve(5);
    GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -350,4 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
  return size;
 }

+const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
+    const std::string &device_name) {
+  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+  constexpr const char *kMaliGPUStr = "Mali";
+  constexpr const char *kPowerVRGPUStr = "PowerVR";
+
+  if (device_name == kQualcommAdrenoGPUStr) {
+    return GPUType::QUALCOMM_ADRENO;
+  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+    return GPUType::MALI;
+  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
+    return GPUType::PowerVR;
+  } else {
+    return GPUType::UNKNOWN;
+  }
+}
+
 }  // namespace mace
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -19,6 +19,13 @@

 namespace mace {

+enum GPUType {
+  QUALCOMM_ADRENO,
+  MALI,
+  PowerVR,
+  UNKNOWN,
+};
+
 class OpenCLProfilingTimer : public Timer {
 public:
  explicit OpenCLProfilingTimer(const cl::Event *event)
@@ -50,6 +57,8 @@ class OpenCLRuntime {
  uint64_t GetDeviceMaxWorkGroupSize();
  uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
  uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
+  const bool IsNonUniformWorkgroupsSupported();
+  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
  cl::Kernel BuildKernel(const std::string &program_name,
                         const std::string &kernel_name,
                         const std::set<std::string> &build_options);
@@ -75,6 +84,8 @@ class OpenCLRuntime {
  std::map<std::string, cl::Program> built_program_map_;
  std::mutex program_build_mutex_;
  std::string kernel_path_;
+  GPUType gpu_type_;
+  std::string opencl_version_;

  static GPUPerfHint gpu_perf_hint_;
  static GPUPriorityHint gpu_priority_hint_;

--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -155,6 +155,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
  ActivationType activation_;
  T relux_max_limit_;
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::string tuning_key_prefix_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/addn.h
+++ b/mace/kernels/addn.h
@@ -90,6 +90,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -157,6 +157,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
                  Tensor *output,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/bias_add.h
+++ b/mace/kernels/bias_add.h
@@ -64,6 +64,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
                  Tensor *output,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/channel_shuffle.h
+++ b/mace/kernels/channel_shuffle.h
@@ -56,6 +56,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  const int groups_;
  std::vector<index_t> input_shape_;
 };

--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -85,6 +85,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
                  Tensor *output,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/depth_to_space.h
+++ b/mace/kernels/depth_to_space.h
@@ -108,6 +108,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  const int block_size_;
  bool d2s_;
  std::vector<index_t> input_shape_;

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -437,6 +437,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -97,6 +97,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/matmul.h
+++ b/mace/kernels/matmul.h
@@ -241,6 +241,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
 };

 }  // namespace kernels

--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -24,15 +24,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  const index_t channel_blocks = RoundUpDiv4(channels);

-  if (kernel_.get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();

+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation");
    built_options.emplace("-Dactivation=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    switch (activation_) {
      case RELU:
        tuning_key_prefix_ = "relu_opencl_kernel_";
@@ -58,10 +61,22 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }
    kernel_ = runtime->BuildKernel("activation", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }

+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
  if (!IsVecEqual(input_shape_, input->shape())) {
    int idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    if (activation_ == PRELU) {
      MACE_CHECK_NOTNULL(alpha);
@@ -73,10 +88,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    input_shape_ = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::string tuning_key =
      Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
             output->dim(3));

--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -24,6 +24,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  const index_t width = input_tensors[0]->dim(2);
  const index_t channels = input_tensors[0]->dim(3);

+  auto runtime = OpenCLRuntime::Global();
+
  for (int i = 1; i < size; ++i) {
    MACE_CHECK_NOTNULL(input_tensors[i]);
    MACE_CHECK(batch == input_tensors[i]->dim(0));
@@ -36,7 +38,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    if (input_tensors.size() > 4) {
      MACE_NOT_IMPLEMENTED;
    }
-    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("addn");
@@ -44,7 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size()));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
+
    kernel_ = runtime->BuildKernel("addn", kernel_name, built_options);
+
+    kwg_size_ =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }

  std::vector<index_t> output_shape = input_tensors[0]->shape();
@@ -53,6 +61,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;

+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
+
  if (!IsVecEqual(input_shape_, input_tensors[0]->shape())) {
    std::vector<size_t> output_image_shape;
    CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL,
@@ -60,6 +71,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    output_tensor->ResizeImage(output_shape, output_image_shape);

    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    for (auto input : input_tensors) {
      kernel_.setArg(idx++, *(input->opencl_image()));
    }
@@ -68,9 +83,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
    input_shape_ = input_tensors[0]->shape();
  }

-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  const std::vector<uint32_t> lws = {64, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
  std::stringstream ss;
  ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
     << "_" << output_shape[2] << "_" << output_shape[3];

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -30,14 +30,23 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  const index_t channel_blocks = RoundUpDiv4(channels);

-  if (kernel_.get() == nullptr) {
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
  auto runtime = OpenCLRuntime::Global();
+
+
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm");
    built_options.emplace("-Dbatch_norm=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    if (folded_constant_) {
      built_options.emplace("-DFOLDED_CONSTANT");
    }
@@ -61,9 +70,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    }

    kernel_ = runtime->BuildKernel("batch_norm", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(scale->opencl_image()));
    kernel_.setArg(idx++, *(offset->opencl_image()));
@@ -78,10 +95,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    input_shape_ = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::string tuning_key =
      Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3), folded_constant_);

--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -23,7 +23,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  const index_t channel_blocks = RoundUpDiv4(channels);

+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
  auto runtime = OpenCLRuntime::Global();
+
  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
@@ -31,25 +36,46 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    built_options.emplace("-Dbias_add=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("bias_add", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(bias->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};

  cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
        kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }
+
+    error = runtime->command_queue().enqueueNDRangeKernel(
+        kernel_, cl::NullRange,
+        cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+        cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+  }
  MACE_CHECK(error == CL_SUCCESS);
  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {

--- a/mace/kernels/opencl/buffer_to_image.cc
+++ b/mace/kernels/opencl/buffer_to_image.cc
@@ -26,7 +26,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
    buffer->Resize(image->shape());
  }

-  size_t gws[2] = {image_shape[0], image_shape[1]};
+  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
+                     static_cast<uint32_t>(image_shape[1])};
  std::string kernel_name;
  switch (type) {
    case CONV2D_FILTER:
@@ -58,11 +59,17 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
                         : "winograd_filter_buffer_to_image";
      break;
  }
+
+  auto runtime = OpenCLRuntime::Global();
+
  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
  std::set<std::string> built_options;
  std::stringstream kernel_name_ss;
  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
  built_options.emplace(kernel_name_ss.str());
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+  }
  if (buffer->dtype() == image->dtype()) {
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
@@ -73,11 +80,14 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
  }
-  auto runtime = OpenCLRuntime::Global();
  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                         obfuscated_kernel_name, built_options);

  uint32_t idx = 0;
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
+    b2f_kernel.setArg(idx++, gws[0]);
+    b2f_kernel.setArg(idx++, gws[1]);
+  }
  b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
  if (!i2b_) {
    MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
@@ -103,13 +113,28 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
    b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
  }
  b2f_kernel.setArg(idx++, *(image->opencl_image()));
-  const std::vector<uint32_t> lws = {16, 64};
+
+  const uint32_t kwg_size =
+      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
+  const std::vector<uint32_t> lws = {16, kwg_size / 16};
+
  cl::Event event;
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+  cl_int error;
+  if (runtime->IsNonUniformWorkgroupsSupported()) {
+    error = runtime->command_queue().enqueueNDRangeKernel(
        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
-  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  } else {
+    std::vector<uint32_t> roundup_gws(lws.size());
+    for (size_t i = 0; i < lws.size(); ++i) {
+      roundup_gws[i] = RoundUp(gws[i], lws[i]);
+    }

+    error = runtime->command_queue().enqueueNDRangeKernel(
+        b2f_kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+        cl::NDRange(lws[0], lws[1]), nullptr, &event);
+  }
+  MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();

--- a/mace/kernels/opencl/channel_shuffle.cc
+++ b/mace/kernels/opencl/channel_shuffle.cc
@@ -30,20 +30,36 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
             "groups must be multiple of 4");
  const index_t group_channel_blocks = RoundUpDiv4(channels_per_group);

-  if (kernel_.get() == nullptr) {
+  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
  auto runtime = OpenCLRuntime::Global();

+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle");
    built_options.emplace("-Dchannel_shuffle=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name,
                                   built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
+
  if (!IsVecEqual(input_shape_, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, groups_);
    kernel_.setArg(idx++, static_cast<uint32_t>(channels_per_group));
@@ -51,10 +67,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(

    input_shape_ = input->shape();
  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(group_channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "channel_shuffle_opencl_kernel_"
     << output->dim(0) << "_"

--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
 #include <common.h>

-__kernel void activation(__read_only image2d_t input,
+__kernel void activation(
+                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                         __read_only image2d_t input,
 #ifdef USE_PRELU
                         __read_only image2d_t alpha,
 #endif
@@ -9,7 +11,16 @@ __kernel void activation(__read_only image2d_t input,
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
  const int width = get_global_size(1);
+#endif

  const int pos = mad24(ch_blk, width, w);
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
 #include <common.h>

-__kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void addn(
+                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                   __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                   __read_only image2d_t input1,
 #if INPUT_NUM > 2
                   __read_only image2d_t input2,
@@ -12,6 +14,10 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
  const int w = get_global_id(0);
  const int hb = get_global_id(1);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
  DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
  DATA_TYPE4 out = in0 + in1;

--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void batch_norm(__read_only image2d_t input,
+__kernel void batch_norm(
+                         UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                         __read_only image2d_t input,
                         __read_only image2d_t scale,
                         __read_only image2d_t offset,
 #ifndef FOLDED_CONSTANT
@@ -13,7 +15,16 @@ __kernel void batch_norm(__read_only image2d_t input,
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
  const int width = get_global_size(1);
+#endif

 #ifdef FOLDED_CONSTANT
  DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));

--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
 #include <common.h>
 // Supported data types: half/float
-__kernel void bias_add(__read_only image2d_t input,
+__kernel void bias_add(
+                       UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                       __read_only image2d_t input,
                       __read_only image2d_t bias,
                       __write_only image2d_t output) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
  const int width = get_global_size(1);
+#endif

  const int pos = mad24(ch_blk, width, w);
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));

--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
 #include <common.h>

-__kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, oc, ic */
+__kernel void filter_buffer_to_image(
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                     __global const DATA_TYPE *input, /* h, w, oc, ic */
                                     __private const int input_offset,
                                     __private const int filter_h,
                                     __private const int filter_w,
@@ -9,6 +11,13 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
                                     __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int in_channel_idx = w;
  const int hw_size = filter_w * filter_h;
  const int out_channel_idx = h / hw_size * 4;
@@ -44,7 +53,9 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic */
+__kernel void filter_image_to_buffer(
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                     __global DATA_TYPE *output, /* h, w, oc, ic */
                                     __private const int filter_h,
                                     __private const int filter_w,
                                     __private const int out_channel,
@@ -52,6 +63,13 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
                                     __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int in_channel_idx = w;
  const int hw_size = filter_w * filter_h;
  const int out_channel_idx = h / hw_size * 4;
@@ -84,7 +102,9 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
  }
 }

-__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+__kernel void dw_filter_buffer_to_image(
+                                        UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                        __global const DATA_TYPE *input, /* h, w, ic, m */
                                        __private const int input_offset,
                                        __private const int filter_w,
                                        __private const int in_channel,
@@ -93,6 +113,12 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
  const int w = get_global_id(0);
  const int h = get_global_id(1);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  DATA_TYPE4 values = 0;
  if (multiplier == 1) {
    const int in_channel_idx = h << 2;
@@ -134,7 +160,9 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_buffer_to_image(
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                     __global const DATA_TYPE *input, /* nhwc */
                                     __private const int input_offset,
                                     __private const int height,
                                     __private const int width,
@@ -142,6 +170,13 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                     __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int batch_idx = h / height;
  const int height_idx = h % height;
  const int width_idx = w % width;
@@ -167,13 +202,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void in_out_image_to_buffer(
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                     __global DATA_TYPE *output, /* nhwc */
                                     __private const int height,
                                     __private const int width,
                                     __private const int channels,
                                     __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int batch_idx = h / height;
  const int height_idx = h % height;
  const int width_idx = w % width;
@@ -198,13 +242,21 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
  }
 }

-__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void arg_buffer_to_image(
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                  __global const DATA_TYPE *input, /* nhwc */
                                  __private const int input_offset,
                                  __private const int count,
                                  __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int offset = input_offset + w * 4;
  const int size = count - w * 4;

@@ -226,11 +278,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
+__kernel void arg_image_to_buffer(
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                  __global DATA_TYPE *output, /* nhwc */
                                  __private const int count,
                                  __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int offset = w * 4;

  int2 coord = (int2)(w, h);
@@ -251,7 +312,9 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
 }


-__kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //nhwc
+__kernel void in_out_height_buffer_to_image(
+                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                            __global const DATA_TYPE *input, //nhwc
                                            __private const int input_offset,
                                            __private const int height,
                                            __private const int width,
@@ -259,6 +322,13 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
                                            __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int wc = width * channels;
  const int height_blks = (height + 3) / 4;
  const int batch_idx = h / height_blks;
@@ -285,13 +355,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
  WRITE_IMAGET(output, coord, values);
 }

-__kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
+__kernel void in_out_height_image_to_buffer(
+                                            UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                            __global DATA_TYPE *output, //nhwc
                                            __private const int height,
                                            __private const int width,
                                            __private const int channels,
                                            __read_only image2d_t input) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int height_blks = (height + 3) / 4;
  const int batch_idx = h / height_blks;
  const int height_idx = (h % height_blks) << 2;
@@ -315,7 +394,9 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
 }


-__kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
+__kernel void in_out_width_buffer_to_image(
+                                           UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                           __global const DATA_TYPE *input, /* nhwc */
                                           __private const int input_offset,
                                           __private const int height,
                                           __private const int width,
@@ -323,6 +404,13 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
                                           __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int width_blks = (width + 3) / 4;
  const int batch_idx = h / height;
  const int height_idx = h % height;
@@ -349,7 +437,9 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
 }

 // only support 3x3 now
-__kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, //Oc, Ic, H, W
+__kernel void winograd_filter_buffer_to_image(
+                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                              __global const DATA_TYPE *input, //Oc, Ic, H, W
                                              __private const int input_offset,
                                              __private const int in_channels,
                                              __private const int height,
@@ -357,7 +447,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
                                              __write_only image2d_t output) {
  int w = get_global_id(0);
  int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+  const int out_channels = global_size_dim1;
+#else
  const int out_channels = get_global_size(1);
+#endif
+
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
@@ -430,13 +529,22 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
 }

 // only support 3x3 now
-__kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc, Ic, H, W
+__kernel void winograd_filter_image_to_buffer(
+                                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                              __global DATA_TYPE *output, //Oc, Ic, H, W
                                              __private const int height,
                                              __private const int width,
                                              __private const int channel,
                                              __read_only image2d_t input) {
  const int w = get_global_id(0);
  const int h = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || h >= global_size_dim1) {
+    return;
+  }
+#endif
+
  const int width_idx = w << 2;
  const int size = width - width_idx;
  int offset = h * width + width_idx;

--- a/mace/kernels/opencl/cl/channel_shuffle.cl
+++ b/mace/kernels/opencl/cl/channel_shuffle.cl
 #include <common.h>

 // assume channes_per_group mod 4 = 0 && groups mod 4 == 0
-__kernel void channel_shuffle(__read_only image2d_t input,
+__kernel void channel_shuffle(
+                              UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                              __read_only image2d_t input,
                              __private const int groups,
                              __private const int channels_per_group,
                              __write_only image2d_t output) {
  const int group_chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
  const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (group_chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
  const int group_blks = groups / 4;
  const int groups_blks_width = group_blks * width;
  const int channels_per_group_blks = channels_per_group / 4;

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -18,6 +18,23 @@
 #define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
 #define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)

+#ifndef NON_UNIFORM_WORK_GROUP
+
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
+    __private const int global_size_dim0,       \
+    __private const int global_size_dim1,       \
+    __private const int global_size_dim2,
+
+#else
+
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+
+#endif
+
 __constant sampler_t SAMPLER =
    CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;


--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -22,14 +22,26 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
 }

 // Supported data type: half/float
-__kernel void concat_channel(__read_only image2d_t input0,
+__kernel void concat_channel(
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             __read_only image2d_t input0,
                             __read_only image2d_t input1,
                             __private const int input0_chan,
                             __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
  const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
  const int input0_chan_blk = (input0_chan + 3) >> 2;

  DATA_TYPE4 data = 0;
@@ -72,13 +84,25 @@ __kernel void concat_channel(__read_only image2d_t input0,
 }

 // Required: All input channels are divisible by 4
-__kernel void concat_channel_multi(__read_only image2d_t input,
+__kernel void concat_channel_multi(
+                                   UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                                   __read_only image2d_t input,
                                   __private const int chan_blk_offset,
                                   __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
  const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
  DATA_TYPE4 data = 0;
  data = READ_IMAGET(input,
                     SAMPLER,

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
 #include <common.h>

-__kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d(
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t filter, /* cout%4 * cin, kh * kw * cout/4 */
 #ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -21,9 +23,18 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                      __private const int dilation_w) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
  DATA_TYPE4 out0 =
     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));

--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
 #include <common.h>

-__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_1x1(
+                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
 #ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -15,9 +17,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                          __private const int stride) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
 #include <common.h>

-__kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void conv_2d_3x3(
+                          UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                          __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin , kh * kw * cout/4 */
 #ifdef BIAS
                          __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -19,9 +21,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                          __private const int dilation_w) {
  const int out_ch_blk = get_global_id(0);
  const int out_w_blk = get_global_id(1);
-  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const int out_w_blks = global_size_dim1;
+#else
+  const int out_w_blks = get_global_size(1);
+#endif
+
 #ifdef BIAS
  DATA_TYPE4 out0 =
     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));

--- a/mace/kernels/opencl/cl/depth_to_space.cl
+++ b/mace/kernels/opencl/cl/depth_to_space.cl
 #include <common.h>

-__kernel void depth_to_space(__read_only image2d_t input,
+__kernel void depth_to_space(
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             __read_only image2d_t input,
                             __private const int block_size,
                             __private const int output_depth,
                             __write_only image2d_t output) {
  const int out_d = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_h = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_d >= global_size_dim0 || out_w >= global_size_dim1
+      || out_h >= global_size_dim2) {
+    return;
+  }
+  const int output_width = global_size_dim1;
+#else
  const int output_width = get_global_size(1);
+#endif

  const int out_pos = mad24(out_d, output_width, out_w);

@@ -27,14 +38,27 @@ __kernel void depth_to_space(__read_only image2d_t input,
  WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
 }

-__kernel void space_to_depth(__read_only image2d_t input,
+__kernel void space_to_depth(
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             __read_only image2d_t input,
                             __private const int block_size,
                             __private const int input_depth,
                             __write_only image2d_t output) {
+
  const int d = get_global_id(0);
  const int w = get_global_id(1);
  const int h = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (d >= global_size_dim0 || w >= global_size_dim1
+      || h >= global_size_dim2) {
+    return;
+  }
+  const int input_width = global_size_dim1;
+#else
  const int input_width = get_global_size(1);
+#endif
+
  const int in_pos = mad24(d, input_width, w);
  const int output_width = input_width / block_size;


--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
 #include <common.h>

 // Only multiplier = 1 is supported
-__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d(
+                               UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                               __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -21,8 +23,18 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
                               __private const short dilation_w) {
  const short out_ch_blk = get_global_id(0);
  const short out_w_blk = get_global_id(1);
-  const short out_w_blks = get_global_size(1);
  const short out_hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || out_w_blk >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+  const short out_w_blks = global_size_dim1;
+#else
+  const short out_w_blks = get_global_size(1);
+#endif
+
  const short rounded_in_ch = in_ch_blks << 2;
  const short in_ch_blk = out_ch_blk; // multiplier = 1

@@ -126,7 +138,9 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 }

-__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void depthwise_conv2d_s1(
+                                  UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                                  __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                  __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
 #ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -145,6 +159,14 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
  const short out_ch_blk = get_global_id(0);
  const short out_w_blk = get_global_id(1) << 2;
  const short out_hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
+      || out_hb >= global_size_dim2) {
+    return;
+  }
+#endif
+
  const short rounded_in_ch = in_ch_blks << 2;
  const short in_ch_blk = out_ch_blk; // multiplier = 1


--- a/mace/kernels/opencl/cl/eltwise.cl
+++ b/mace/kernels/opencl/cl/eltwise.cl
 #include <common.h>

-__kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
+__kernel void eltwise(
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                      __read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
                      __read_only image2d_t input1,
 #ifdef COEFF_SUM
                      __private const float coeff0,
@@ -10,6 +12,10 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
  const int w = get_global_id(0);
  const int hb = get_global_id(1);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (w >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
  DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
  DATA_TYPE4 out;

--- a/mace/kernels/opencl/cl/matmul.cl
+++ b/mace/kernels/opencl/cl/matmul.cl
 #include <common.h>

 // C = A * B
-__kernel void matmul(__read_only image2d_t A,
+__kernel void matmul(
+                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                     __read_only image2d_t A,
                     __read_only image2d_t B,
                     __write_only image2d_t C,
                     __private const int M,
@@ -11,6 +13,11 @@ __kernel void matmul(__read_only image2d_t A,
                     __private const int k_blocks) {
  const int gx = get_global_id(0) << 2;
  const int hb = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (get_global_id(0) >= global_size_dim0 || hb >= global_size_dim1) return;
+#endif
+
  const int batch = hb / height_blocks;
  const int ty = (hb % height_blocks);
  const int gy = mad24(batch, height_blocks, ty);

--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -19,7 +19,9 @@ inline int calculate_avg_block_size(const int pool_size,
 }

 // Supported data type: half/float
-__kernel void pooling(__read_only image2d_t input,
+__kernel void pooling(
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                      __read_only image2d_t input,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int out_height,
@@ -28,11 +30,21 @@ __kernel void pooling(__read_only image2d_t input,
                      __private const int stride,
                      __private const int pooling_size,
                      __write_only image2d_t output) {
+
  const int out_chan_idx = get_global_id(0);
  const int out_width_idx = get_global_id(1);
-  const int out_width = get_global_size(1);
  const int out_hb_idx = get_global_id(2);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_chan_idx >= global_size_dim0 || out_width_idx >= global_size_dim1
+      || out_hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int out_width = global_size_dim1;
+#else
+  const int out_width = get_global_size(1);
+#endif
+
  const int batch_idx = mul24((out_hb_idx / out_height), in_height);
  const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
  const int in_width_start = mul24(out_width_idx, stride) - pad_left;

--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
 #include <common.h>

-__kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
+__kernel void resize_bilinear_nocache(
+                                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                                      __read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                                      __write_only image2d_t output,
                                      __private const float height_scale,
                                      __private const float width_scale,
                                      __private const int in_height,
                                      __private const int in_width,
                                      __private const int out_height) {
+
  const int ch_blk = get_global_id(0);
-  const int ch_blks = get_global_size(0);
  const int w = get_global_id(1);
-  const int out_width = get_global_size(1);
  const int hb = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (ch_blk >= global_size_dim0 || w >= global_size_dim1
+      || hb >= global_size_dim2) {
+    return;
+  }
+  const int ch_blks = global_size_dim0;
+  const int out_width = global_size_dim1;
+#else
+  const int ch_blks = get_global_size(0);
+  const int out_width = get_global_size(1);
+#endif
+
  const int b = hb / out_height;
  const int h = hb % out_height;


--- a/mace/kernels/opencl/cl/slice.cl
+++ b/mace/kernels/opencl/cl/slice.cl
 #include <common.h>

-__kernel void slice(__read_only image2d_t input,
+__kernel void slice(
+                    UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                    __read_only image2d_t input,
                    __private const int chan_blk_offset,
                    __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
-  const int width = get_global_size(1);
  const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int width = global_size_dim1;
+#else
+  const int width = get_global_size(1);
+#endif
+
  DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
                                (int2)(mad24(chan_blk_idx + chan_blk_offset,
                                             width, width_idx), hb_idx));

--- a/mace/kernels/opencl/cl/softmax.cl
+++ b/mace/kernels/opencl/cl/softmax.cl
 #include <common.h>

-__kernel void softmax(__read_only image2d_t input,
+__kernel void softmax(
+                      UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                      __read_only image2d_t input,
                      __private const int channels,
                      __private const int remain_channels,
                      __write_only image2d_t output) {
  const int chan_blk_idx = get_global_id(0);
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
+      || hb_idx >= global_size_dim2) {
+    return;
+  }
+  const int chan_blks = global_size_dim0 - 1;
+  const int width = global_size_dim1;
+#else
  const int chan_blks = get_global_size(0) - 1;
  const int width = get_global_size(1);
+#endif

  int pos = width_idx;
  DATA_TYPE max_value = -FLT_MAX;

--- a/mace/kernels/opencl/cl/space_to_batch.cl
+++ b/mace/kernels/opencl/cl/space_to_batch.cl
 #include <common.h>

-__kernel void space_to_batch(__read_only image2d_t space_data,
+__kernel void space_to_batch(
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             __read_only image2d_t space_data,
                             __write_only image2d_t batch_data,
                             __private const int block_height,
                             __private const int block_width,
@@ -14,6 +16,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
  const int batch_w_idx = get_global_id(1);
  const int batch_hb_idx = get_global_id(2);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
+      || batch_hb_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+
  const int batch_b_idx = batch_hb_idx / batch_height;
  const int batch_h_idx = batch_hb_idx % batch_height;

@@ -39,7 +48,9 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
  WRITE_IMAGET(batch_data, batch_coord, value);
 }

-__kernel void batch_to_space(__read_only image2d_t batch_data,
+__kernel void batch_to_space(
+                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
+                             __read_only image2d_t batch_data,
                             __write_only image2d_t space_data,
                             __private const int block_height,
                             __private const int block_width,
@@ -53,6 +64,13 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
  const int batch_w_idx = get_global_id(1);
  const int batch_hb_idx = get_global_id(2);

+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
+      || batch_hb_idx >= global_size_dim2) {
+    return;
+  }
+#endif
+
  const int batch_b_idx = batch_hb_idx / batch_height;
  const int batch_h_idx = batch_hb_idx % batch_height;


--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
 #include <common.h>

-__kernel void winograd_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_transform_2x2(
+                                     UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                     __read_only image2d_t input,
                                     __write_only image2d_t output,
                                     __private const int in_height,
                                     __private const int in_width,
@@ -11,7 +13,15 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
                                     __private const int padding_left) {
  int out_width_idx = get_global_id(0);
  int chan_blk_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (out_width_idx >= global_size_dim0 || chan_blk_idx >= global_size_dim1) {
+    return;
+  }
+  const int chan_blk_size = global_size_dim1;
+#else
  const int chan_blk_size = get_global_size(1);
+#endif

  const int batch_idx = out_width_idx / round_hw;
  const int t_idx = out_width_idx % round_hw;
@@ -106,7 +116,9 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
  }
 }

-__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
+__kernel void winograd_inverse_transform_2x2(
+                                             UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
+                                             __read_only image2d_t input,
 #ifdef BIAS
                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
@@ -118,7 +130,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                             __private const float relux_max_limit) {
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
+
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
+    return;
+  }
+  const int out_channel = global_size_dim1;
+#else
  const int out_channel = get_global_size(1);
+#endif
+
  int width = width_idx;
  int height = height_idx;


--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -17,19 +17,28 @@ static void Concat2(cl::Kernel *kernel,
                    const DataType dt,
                    std::vector<index_t> *prev_input_shape,
                    Tensor *output,
-                    StatsFuture *future) {
+                    StatsFuture *future,
+                    uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channel = output->dim(3);

  const int channel_blk = RoundUpDiv4(channel);
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
+      static_cast<uint32_t>(batch * height),
+  };

-  if (kernel->get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel");
    built_options.emplace("-Dconcat_channel=" + kernel_name);
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    if (input0->dtype() == output->dtype()) {
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
@@ -41,9 +50,17 @@ static void Concat2(cl::Kernel *kernel,
      built_options.emplace("-DDIVISIBLE_FOUR");
    }
    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  if (!IsVecEqual(*prev_input_shape, input0->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++,
                   *(static_cast<const cl::Image2D *>(input0->opencl_image())));
    kernel->setArg(idx++,
@@ -51,14 +68,11 @@ static void Concat2(cl::Kernel *kernel,
    kernel->setArg(idx++, static_cast<int32_t>(input0->dim(3)));
    kernel->setArg(idx++,
                   *(static_cast<cl::Image2D *>(output->opencl_image())));
+
    *prev_input_shape = input0->shape();
  }

-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blk), static_cast<uint32_t>(width),
-      static_cast<uint32_t>(batch * height),
-  };
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::stringstream ss;
  ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);
@@ -69,38 +83,51 @@ static void ConcatN(cl::Kernel *kernel,
                    const std::vector<const Tensor *> &input_list,
                    const DataType dt,
                    Tensor *output,
-                    StatsFuture *future) {
+                    StatsFuture *future,
+                    uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channel = output->dim(3);

-  if (kernel->get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi");
    built_options.emplace("-Dconcat_channel_multi=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    *kernel = runtime->BuildKernel("concat", kernel_name, built_options);
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }

  const int inputs_count = input_list.size();
  index_t chan_blk_offset = 0;
  for (int i = 0; i < inputs_count; ++i) {
    const Tensor *input = input_list[i];
+    index_t input_channel_blk = input->dim(3) / 4;
+    const uint32_t gws[3] = {
+        static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
+        static_cast<uint32_t>(batch * height),
+    };
+
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, static_cast<int32_t>(chan_blk_offset));
    kernel->setArg(idx++, *(output->opencl_image()));

-    index_t input_channel_blk = input->dim(3) / 4;
    chan_blk_offset += input_channel_blk;
-    const uint32_t gws[3] = {
-        static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
-        static_cast<uint32_t>(batch * height),
-    };
-    const std::vector<uint32_t> lws = {8, 16, 8, 1};
+    const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
    std::stringstream ss;
    ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
       << batch * height;
@@ -145,11 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
  switch (inputs_count) {
    case 2:
      Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum<T>::value,
-              &input_shape_, output, future);
+              &input_shape_, output, future, &kwg_size_);
      break;
    default:
      if (divisible_four) {
-        ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future);
+        ConcatN(&kernel_, input_list, DataTypeToEnum<T>::value, output, future,
+            &kwg_size_);
      } else {
        MACE_NOT_IMPLEMENTED;
      }

--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -20,7 +20,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future);
+                             StatsFuture *future,
+                             uint32_t *kwg_size);

 extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const Tensor *input,
@@ -34,7 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future);
+                             StatsFuture *future,
+                             uint32_t *kwg_size);

 extern void Conv2dOpencl(cl::Kernel *kernel,
                         const Tensor *input,
@@ -48,7 +50,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         const DataType dt,
                         std::vector<index_t> *prev_input_shape,
                         Tensor *output,
-                         StatsFuture *future);
+                         StatsFuture *future,
+                         uint32_t *kwg_size);

 template <typename T>
 void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
@@ -61,7 +64,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      const Tensor *bias, const int stride, const int *padding,
      const int *dilations, const ActivationType activation,
      const float relux_max_limit, const DataType dt,
-      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future);
+      std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
+      uint32_t *kwg_size);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5] = {
      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
@@ -101,11 +105,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    auto conv2d_func = selector[kernel_h - 1];
    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                dilations_, activation_, relux_max_limit_,
-                DataTypeToEnum<T>::value, &input_shape_, output, future);
+                DataTypeToEnum<T>::value, &input_shape_, output, future,
+                &kwg_size_);
  } else {
    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                 dilations_, activation_, relux_max_limit_,
-                 DataTypeToEnum<T>::value, &input_shape_, output, future);
+                 DataTypeToEnum<T>::value, &input_shape_, output, future,
+                 &kwg_size_);
  }
 }


--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -22,7 +22,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future) {
+                             StatsFuture *future,
+                             uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -36,6 +37,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
  const index_t width_blocks = RoundUpDiv4(width);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);

+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel->get() == nullptr) {
    MACE_CHECK(input_batch == batch);

@@ -44,6 +47,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    built_options.emplace("-Dconv_2d_1x1=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    if (bias != nullptr) {
      built_options.emplace("-DBIAS");
    }
@@ -66,11 +72,23 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
        LOG(FATAL) << "Unknown activation type: " << activation;
    }

-    auto runtime = OpenCLRuntime::Global();
    *kernel = runtime->BuildKernel("conv_2d_1x1", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -89,10 +107,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width_blocks),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 15, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::string tuning_key =
      Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -24,7 +24,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const DataType dt,
                             std::vector<index_t> *prev_input_shape,
                             Tensor *output,
-                             StatsFuture *future) {
+                             StatsFuture *future,
+                             uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -35,12 +36,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3");
    built_options.emplace("-Dconv_2d_3x3=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation) {
      case NOOP:
@@ -61,11 +67,23 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
        LOG(FATAL) << "Unknown activation type: " << activation;
    }

-    auto runtime = OpenCLRuntime::Global();
    *kernel = runtime->BuildKernel("conv_2d_3x3", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -87,10 +105,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width_blocks),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {4, 15, 8, 1};
+  const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 1};
  std::string tuning_key =
      Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -24,7 +24,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         const DataType dt,
                         std::vector<index_t> *prev_input_shape,
                         Tensor *output,
-                         StatsFuture *future) {
+                         StatsFuture *future,
+                         uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -35,12 +36,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv4(width);

+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d");
    built_options.emplace("-Dconv_2d=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation) {
      case NOOP:
@@ -61,11 +67,23 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
        LOG(FATAL) << "Unknown activation type: " << activation;
    }

-    auto runtime = OpenCLRuntime::Global();
    *kernel = runtime->BuildKernel("conv_2d", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -89,10 +107,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    *prev_input_shape = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width_blocks),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::string tuning_key =
      Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));

--- a/mace/kernels/opencl/depth_to_space_opencl.cc
+++ b/mace/kernels/opencl/depth_to_space_opencl.cc
@@ -45,8 +45,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &image_shape);
  output->ResizeImage(output_shape, image_shape);

-  if (kernel_.get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
    std::stringstream kernel_name_ss;
@@ -55,38 +56,49 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ =
        runtime->BuildKernel("depth_to_space", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
+
+  uint32_t gws[3];
+  std::stringstream ss;
  if (!IsVecEqual(input_shape_, input->shape())) {
+    if (d2s_) {
+      gws[0] = static_cast<uint32_t>(depth_blocks);
+      gws[1] = static_cast<uint32_t>(output_width);
+      gws[2] = static_cast<uint32_t>(output_height * batch);
+      ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
+         << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
+    } else {
+      gws[0] = static_cast<uint32_t>(depth_blocks);
+      gws[1] = static_cast<uint32_t>(input_width);
+      gws[2] = static_cast<uint32_t>(input_height * batch);
+      ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
+         << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
+    }
+
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, block_size_);
    kernel_.setArg(idx++, depth_blocks);
    kernel_.setArg(idx++, *(output->opencl_image()));
+
    input_shape_ = input->shape();
  }

-  if (d2s_) {
-    const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
-                             static_cast<uint32_t>(output_width),
-                             static_cast<uint32_t>(output_height * batch)};
-    const std::vector<uint32_t> lws = {8, 16, 8, 1};
-    std::stringstream ss;
-    ss << "depth_to_space_opencl_kernel_" << output->dim(0) << "_"
-       << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
-
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
-  } else {
-    const uint32_t gws[3] = {static_cast<uint32_t>(depth_blocks),
-                             static_cast<uint32_t>(input_width),
-                             static_cast<uint32_t>(input_height * batch)};
-    const std::vector<uint32_t> lws = {8, 16, 8, 1};
-    std::stringstream ss;
-    ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
-       << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
-    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
-  }
 }

 template struct DepthToSpaceOpFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -23,7 +23,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                     const DataType dt,
                     std::vector<index_t> *prev_input_shape,
                     Tensor *output,
-                     StatsFuture *future) {
+                     StatsFuture *future,
+                     uint32_t *kwg_size) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
@@ -35,8 +36,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv4(width);
-  if (kernel->get() == nullptr) {
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel->get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
    if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) {
@@ -45,6 +52,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    } else {
      built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
    }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
@@ -70,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,

    *kernel =
        runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);
+
+    *kwg_size =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(*kernel));
  }
  if (!IsVecEqual(*prev_input_shape, input->shape())) {
    const index_t input_batch = input->dim(0);
@@ -84,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
               input_channels);

    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel->setArg(idx++, gws[0]);
+      kernel->setArg(idx++, gws[1]);
+      kernel->setArg(idx++, gws[2]);
+    }
    kernel->setArg(idx++, *(input->opencl_image()));
    kernel->setArg(idx++, *(filter->opencl_image()));
    if (bias != nullptr) {
@@ -104,13 +122,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
      kernel->setArg(idx++, static_cast<int16_t>(dilations[0]));
      kernel->setArg(idx++, static_cast<int16_t>(dilations[1]));
    }
+
    *prev_input_shape = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width_blocks),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
  std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
                                  batch, height, width, channels, multiplier);
  TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
@@ -165,7 +181,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(

  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(),
                  dilations_, activation_, relux_max_limit_,
-                  DataTypeToEnum<T>::value, &input_shape_, output, future);
+                  DataTypeToEnum<T>::value, &input_shape_, output, future,
+                  &kwg_size_);
 }

 template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/eltwise_opencl.cc
+++ b/mace/kernels/opencl/eltwise_opencl.cc
@@ -24,8 +24,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
  const index_t width_pixels = channel_blocks * width;
  const index_t batch_height_pixels = batch * height;

-  if (kernel_.get() == nullptr) {
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
+
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise");
@@ -33,11 +37,21 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
    built_options.emplace(MakeString("-DELTWISE_TYPE=", type_));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM");
    kernel_ = runtime->BuildKernel("eltwise", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input0->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(idx++, *(input0->opencl_image()));
    kernel_.setArg(idx++, *(input1->opencl_image()));
    if (!coeff_.empty()) {
@@ -45,12 +59,11 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
      kernel_.setArg(idx++, coeff_[1]);
    }
    kernel_.setArg(idx++, *(output->opencl_image()));
+
    input_shape_ = input0->shape();
  }

-  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
-                           static_cast<uint32_t>(batch_height_pixels)};
-  const std::vector<uint32_t> lws = {64, 16, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 1};
  std::stringstream ss;
  ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -200,6 +200,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
+
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -226,12 +227,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
        {4, kwg_size / 28, 7, 1},
        {4, kwg_size / 32, 8, 1},
        {4, kwg_size / 56, 14, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},
    };
  };
  cl::Event event;
@@ -240,6 +236,13 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
    MACE_CHECK(params.size() == 4)
        << "Tuning parameters of 3D kernel must be 4D";
    cl_int error = CL_SUCCESS;
+    std::vector<uint32_t> roundup_gws(3);
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      for (size_t i = 0; i < 3; ++i) {
+        roundup_gws[i] = RoundUp(gws[i], params[i]);
+      }
+    }
+
    if (timer == nullptr) {
      uint32_t num_blocks = params[3];
      const uint32_t block_size = gws[2] / num_blocks;
@@ -247,17 +250,32 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws2 =
            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        if (runtime->IsNonUniformWorkgroupsSupported()) {
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, 0, i * block_size),
              cl::NDRange(gws[0], gws[1], gws2),
              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        } else {
+          uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, 0, i * block_size),
+              cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+              cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        }
        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      }
    } else {
      timer->ClearTiming();
+      if (runtime->IsNonUniformWorkgroupsSupported()) {
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      } else {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange,
+            cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      }
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      timer->AccumulateTiming();
      tuning_result->assign(params.begin(), params.end());
@@ -273,10 +291,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws2 =
              (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
            error = runtime->command_queue().enqueueNDRangeKernel(
                kernel, cl::NDRange(0, 0, i * block_size),
                cl::NDRange(gws[0], gws[1], gws2),
                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          } else {
+            uint32_t roundup_gws2 = RoundUp(gws2, params[2]);
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, 0, i * block_size),
+                cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws2),
+                cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+          }
          MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
          timer->AccumulateTiming();
        }
@@ -304,6 +330,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
                         const std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
+
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    const uint32_t kwg_size =
        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
@@ -318,7 +345,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
            {kwg_size / 64, 64, 1},
            {kwg_size / 128, 128, 1},
            {kwg_size / 256, 256, 1},
-            {kwg_size / 512, 512, 1},
            {kwg_size, 1, 1},
            {1, kwg_size, 1}};
  };
@@ -328,6 +354,13 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
    MACE_CHECK(params.size() == 3)
        << "Tuning parameters of 2D kernel must be 3d";
    cl_int error = CL_SUCCESS;
+    std::vector<uint32_t> roundup_gws(2);
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      for (size_t i = 0; i < 2; ++i) {
+        roundup_gws[i] = RoundUp(gws[i], params[i]);
+      }
+    }
+
    if (timer == nullptr) {
      uint32_t num_blocks = params[2];
      const uint32_t block_size = gws[1] / num_blocks;
@@ -335,16 +368,30 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t gws1 =
            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        if (runtime->IsNonUniformWorkgroupsSupported()) {
          error = runtime->command_queue().enqueueNDRangeKernel(
              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
              cl::NDRange(params[0], params[1]), nullptr, &event);
+        } else {
+          uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+          error = runtime->command_queue().enqueueNDRangeKernel(
+              kernel, cl::NDRange(0, i * block_size),
+              cl::NDRange(roundup_gws[0], roundup_gws1),
+              cl::NDRange(params[0], params[1]), nullptr, &event);
+        }
        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      }
    } else {
      timer->ClearTiming();
+      if (runtime->IsNonUniformWorkgroupsSupported()) {
        error = runtime->command_queue().enqueueNDRangeKernel(
            kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
            cl::NDRange(params[0], params[1]), nullptr, &event);
+      } else {
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel, cl::NullRange, cl::NDRange(roundup_gws[0], roundup_gws[1]),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+      }
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      timer->AccumulateTiming();
      tuning_result->assign(params.begin(), params.end());
@@ -360,9 +407,18 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
        for (uint32_t i = 0; i < num_blocks; ++i) {
          uint32_t gws1 =
              (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+          if (runtime->IsNonUniformWorkgroupsSupported()) {
            error = runtime->command_queue().enqueueNDRangeKernel(
-              kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1),
+                kernel, cl::NDRange(0, i * block_size),
+                cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]),
+                nullptr, &event);
+          } else {
+            uint32_t roundup_gws1 = RoundUp(gws1, params[1]);
+            error = runtime->command_queue().enqueueNDRangeKernel(
+                kernel, cl::NDRange(0, i * block_size),
+                cl::NDRange(roundup_gws[0], roundup_gws1),
                cl::NDRange(params[0], params[1]), nullptr, &event);
+          }
          MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
          timer->AccumulateTiming();
        }

--- a/mace/kernels/opencl/matmul.cc
+++ b/mace/kernels/opencl/matmul.cc
@@ -26,18 +26,33 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,

  const index_t height_blocks = RoundUpDiv4(height);
  const index_t width_blocks = RoundUpDiv4(width);
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(width_blocks),
+      static_cast<uint32_t>(height_blocks * batch),
+  };

-  if (kernel_.get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    auto dt = DataTypeToEnum<T>::value;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul");
    built_options.emplace("-Dmatmul=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  uint32_t idx = 0;
+  if (!runtime->IsNonUniformWorkgroupsSupported()) {
+    kernel_.setArg(idx++, gws[0]);
+    kernel_.setArg(idx++, gws[1]);
+  }
  kernel_.setArg(idx++, *(A->opencl_image()));
  kernel_.setArg(idx++, *(B->opencl_image()));
  kernel_.setArg(idx++, *(C->opencl_image()));
@@ -47,11 +62,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
  kernel_.setArg(idx++, static_cast<int>(height_blocks));
  kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));

-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(width_blocks),
-      static_cast<uint32_t>(height_blocks * batch),
-  };
-  const std::vector<uint32_t> lws = {16, 64, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 1};
  std::stringstream ss;
  ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
     << C->dim(2) << "_" << C->dim(3);

--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -18,12 +18,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1)
      << "Pooling opencl kernel not support dilation yet";

+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel_.get() == nullptr) {
    const DataType dt = DataTypeToEnum<T>::value;
-    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling");
    built_options.emplace("-Dpooling=" + kernel_name);
+
    if (pooling_type_ == MAX && input->dtype() == output->dtype()) {
      built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
      built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
@@ -35,8 +37,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    if (pooling_type_ == AVG) {
      built_options.emplace("-DPOOL_AVG");
    }
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
+
+  std::vector<uint32_t> gws;
  if (!IsVecEqual(input_shape_, input->shape())) {
    std::vector<index_t> output_shape(4);
    std::vector<index_t> filter_shape = {kernels_[0], kernels_[1],
@@ -59,7 +69,24 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                    &output_image_shape);
    output->ResizeImage(output_shape, output_image_shape);

+    index_t batch = output->dim(0);
+    index_t out_height = output->dim(1);
+    index_t out_width = output->dim(2);
+    index_t channels = output->dim(3);
+
+    index_t channel_blocks = (channels + 3) / 4;
+
+    gws = {
+        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
+        static_cast<uint32_t>(batch * out_height),
+    };
+
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
    kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
@@ -71,8 +98,7 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    kernel_.setArg(idx++, *(output->opencl_image()));

    input_shape_ = input->shape();
-  }
-
+  } else {
    index_t batch = output->dim(0);
    index_t out_height = output->dim(1);
    index_t out_width = output->dim(2);
@@ -80,16 +106,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

    index_t channel_blocks = (channels + 3) / 4;

-
-  const uint32_t gws[3] = {
+    gws = {
        static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(out_width),
        static_cast<uint32_t>(batch * out_height),
    };
-  std::vector<uint32_t> lws = {8, 16, 8, 1};
+  }
+
+  std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);
-  TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+  TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
 }

 template struct PoolingFunctor<DeviceType::OPENCL, float>;

--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -24,16 +24,27 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
  const index_t out_height = out_height_;
  const index_t out_width = out_width_;

-  if (kernel_.get() == nullptr) {
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
+
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache");
    built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ =
        runtime->BuildKernel("resize_bilinear", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, input->shape())) {
    MACE_CHECK(out_height > 0 && out_width > 0);
@@ -50,6 +61,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
        CalculateResizeScale(in_width, out_width, align_corners_);

    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, *(output->opencl_image()));
    kernel_.setArg(idx++, height_scale);
@@ -61,10 +77,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
    input_shape_ = input->shape();
  }

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(out_width),
-                           static_cast<uint32_t>(out_height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
     << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -29,15 +29,22 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
    output_list[i]->ResizeImage(output_shape, image_shape);
  }

-  if (kernel_.get() == nullptr) {
  auto runtime = OpenCLRuntime::Global();
+
+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
    built_options.emplace("-Dslice=" + kernel_name);
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE="
                           + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  const index_t channel_blk = RoundUpDiv4(output_channels);

@@ -46,7 +53,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
      static_cast<uint32_t>(input->dim(2)),
      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
  };
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "slice_opencl_kernel_"
     << input->dim(0) << "_"
@@ -56,6 +64,11 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
     << outputs_count;
  for (int i = 0; i < outputs_count; ++i) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(input->opencl_image()));
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
    kernel_.setArg(idx++, *(output_list[i]->opencl_image()));

--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -23,29 +23,43 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
  const index_t channel_blocks = RoundUpDiv4(channels);
  const int remain_channels = channel_blocks * 4 - channels;

-  if (kernel_.get() == nullptr) {
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+
  auto runtime = OpenCLRuntime::Global();

+  if (kernel_.get() == nullptr) {
    std::set<std::string> built_options;
    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax");
    built_options.emplace("-Dsoftmax=" + kernel_name);
    auto dt = DataTypeToEnum<T>::value;
    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(input_shape_, logits->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    kernel_.setArg(idx++, *(logits->opencl_image()));
    kernel_.setArg(idx++, static_cast<int>(channels));
    kernel_.setArg(idx++, remain_channels);
    kernel_.setArg(idx++, *(output->opencl_image()));
+
    input_shape_ = logits->shape();
  }
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
     << "_" << output->dim(2) << "_" << output->dim(3);

--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -31,9 +31,15 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    batch_tensor->ResizeImage(output_shape, output_image_shape);
    kernel_name = "space_to_batch";
  }
+  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
+  const uint32_t gws[3] = {
+      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
+      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
+
+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
-    auto runtime = OpenCLRuntime::Global();
    std::set<std::string> built_options;
    std::stringstream kernel_name_ss;
    kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
@@ -41,11 +47,22 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ =
        runtime->BuildKernel("space_to_batch", kernel_name, built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  if (!IsVecEqual(space_shape_, space_tensor->shape())) {
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+      kernel_.setArg(idx++, gws[2]);
+    }
    if (b2s_) {
      kernel_.setArg(idx++, *(batch_tensor->opencl_image()));
      kernel_.setArg(idx++, *(space_tensor->opencl_image()));
@@ -65,11 +82,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
    space_shape_ = space_tensor->shape();
  }

-  const uint32_t chan_blk = RoundUpDiv4<uint32_t>(batch_tensor->dim(3));
-  const uint32_t gws[3] = {
-      chan_blk, static_cast<uint32_t>(batch_tensor->dim(2)),
-      static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
  std::stringstream ss;
  ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
     << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -15,6 +15,8 @@ template <typename T>
 void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) {

+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2");
@@ -24,9 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-    auto runtime = OpenCLRuntime::Global();
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                   built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
  std::vector<index_t> output_shape(4);
  std::vector<index_t> filter_shape = {3, 3, input_tensor->dim(3), 1};
@@ -44,6 +51,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
  const index_t round_h = (output_shape[1] + 1) / 2;
  const index_t round_w = (output_shape[2] + 1) / 2;
  const index_t out_width = input_tensor->dim(0) * round_h * round_w;
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};

  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
    output_shape = {16, input_tensor->dim(3), out_width, 1};
@@ -52,6 +62,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    output_tensor->ResizeImage(output_shape, image_shape);

    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(idx++, *(input_tensor->opencl_image()));
    kernel_.setArg(idx++, *(output_tensor->opencl_image()));
    kernel_.setArg(idx++, static_cast<uint32_t>(input_tensor->dim(1)));
@@ -65,10 +79,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
    input_shape_ = input_tensor->shape();
  }

-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(3)))};
-  const std::vector<uint32_t> lws = {128, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};
  std::stringstream ss;
  ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
     << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
@@ -82,6 +93,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *bias,
    Tensor *output_tensor,
    StatsFuture *future) {
+
+  auto runtime = OpenCLRuntime::Global();
+
  if (kernel_.get() == nullptr) {
    std::string obfuscated_kernel_name =
        MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2");
@@ -92,6 +106,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
                          DtToUpstreamCLDt(DataTypeToEnum<T>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
+    }
    built_options.emplace(bias != nullptr ? "-DBIAS" : "");
    switch (activation_) {
      case NOOP:
@@ -115,10 +132,16 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
        LOG(FATAL) << "Unknown activation type: " << activation_;
    }

-    auto runtime = OpenCLRuntime::Global();
    kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name,
                                   built_options);
+
+    kwg_size_ =
+        static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel_));
  }
+
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(input_tensor->dim(2)),
+      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
  if (!IsVecEqual(input_shape_, input_tensor->shape())) {
    std::vector<index_t> output_shape = {batch_, height_, width_,
                                         input_tensor->dim(1)};
@@ -129,6 +152,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    const uint32_t round_h = (height_ + 1) / 2;
    const uint32_t round_w = (width_ + 1) / 2;
    uint32_t idx = 0;
+    if (!runtime->IsNonUniformWorkgroupsSupported()) {
+      kernel_.setArg(idx++, gws[0]);
+      kernel_.setArg(idx++, gws[1]);
+    }
    kernel_.setArg(
        idx++,
        *(static_cast<const cl::Image2D *>(input_tensor->opencl_image())));
@@ -147,10 +174,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
    input_shape_ = input_tensor->shape();
  }

-  const uint32_t gws[2] = {
-      static_cast<uint32_t>(input_tensor->dim(2)),
-      static_cast<uint32_t>(RoundUpDiv4(input_tensor->dim(1)))};
-  const std::vector<uint32_t> lws = {128, 8, 1};
+  const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 1};

  std::stringstream ss;
  ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"

--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -185,6 +185,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/resize_bilinear.h
+++ b/mace/kernels/resize_bilinear.h
@@ -173,6 +173,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -61,6 +61,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
                  const std::vector<Tensor *> &output_list,
                  StatsFuture *future);
  cl::Kernel kernel_;
+  uint32_t kwg_size_;
 };

 }  // namespace kernels

--- a/mace/kernels/softmax.h
+++ b/mace/kernels/softmax.h
@@ -61,6 +61,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/mace/kernels/space_to_batch.h
+++ b/mace/kernels/space_to_batch.h
@@ -56,6 +56,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> space_shape_;
 };


--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -51,6 +51,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };

@@ -108,6 +109,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
                  StatsFuture *future);

  cl::Kernel kernel_;
+  uint32_t kwg_size_;
  std::vector<index_t> input_shape_;
 };


--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -43,6 +43,10 @@ else
    HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
  fi

+  if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
+    NEON_ENABLE_FLAG="--define neon=true"
+  fi
+
  bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -54,6 +58,7 @@ else
    --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
    --define openmp=true \
    --copt="-O3" \
+    $NEON_ENABLE_FLAG \
    $PRODUCTION_MODE_BUILD_FLAGS \
    $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi

--- a/tools/mace_tools.py
+++ b/tools/mace_tools.py
@@ -376,6 +376,7 @@ def main(unused_args):
        build_run_throughput_test(target_soc, FLAGS.run_seconds,
                                  merged_lib_file, FLAGS.output_dir)

+  if FLAGS.mode == "build" or FLAGS.mode == "all":
    packaging_lib_file(FLAGS.output_dir)



--- a/tools/packaging_lib.sh
+++ b/tools/packaging_lib.sh
@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh

 LIBMACE_BUILD_DIR=$1

+TAR_PACKAGE_NAME=libmace_${PROJECT_NAME}.tar.gz
+
 pushd $LIBMACE_BUILD_DIR/$PROJECT_NAME
-ls | grep -v build | xargs tar cvzf libmace_${PROJECT_NAME}.tar.gz
+if [ -f $TAR_PACKAGE_NAME ]; then
+  rm -f $TAR_PACKAGE_NAME
+fi
+ls | grep -v build | xargs tar cvzf $TAR_PACKAGE_NAME
 popd

 echo "Packaging done!"