diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc
index 4d0c235134729a76cb00ee1f77713fa23f93c0d9..ad5c8eaca8273ffc63f24949bc59f03731163ce5 100644
--- a/mace/core/runtime/opencl/opencl_allocator.cc
+++ b/mace/core/runtime/opencl/opencl_allocator.cc
@@ -56,7 +56,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const {
                                       nbytes, nullptr, &error);
   if (error != CL_SUCCESS) {
     LOG(WARNING) << "Allocate OpenCL Buffer with "
-                 << nbytes << " bytes failed because of"
+                 << nbytes << " bytes failed because of "
                  << OpenCLErrorToString(error);
     delete buffer;
     *result = nullptr;
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 5235479db1455f1b7830445b6e8d3de1d56da9db..ae2f69390968841a4f7410ccec3b16be2bc15971 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -371,7 +371,8 @@ OpenCLRuntime::OpenCLRuntime():
   }
 
   cl_int err;
-  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO
+          && opencl_version_ == OpenCLVersion::CL_VER_2_0) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties,
@@ -698,7 +699,7 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
 
 bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() const {
   return (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
-      opencl_version_ == "2.0");
+      opencl_version_ == OpenCLVersion::CL_VER_2_0);
 }
 
 GPUType OpenCLRuntime::gpu_type() const {
@@ -709,13 +710,32 @@ const std::string OpenCLRuntime::platform_info() const {
   return platform_info_;
 }
 
-const std::string OpenCLRuntime::ParseDeviceVersion(
+// Maps an OpenCL device version string to its OpenCLVersion enumerator.
+// Logs at FATAL severity when the version field is missing or is not one
+// of 1.0, 1.1, 1.2 or 2.0.
+OpenCLVersion OpenCLRuntime::ParseDeviceVersion(
     const std::string &device_version) {
   // OpenCL Device version string format:
   // OpenCL<space><major_version.minor_version><space>
   // <vendor-specific information>
   auto words = Split(device_version, ' ');
-  return words[1];
+  if (words.size() < 2) {
+    // No version field: words[1] below would read out of bounds.
+    LOG(FATAL) << "Malformed OpenCL device version: " << device_version;
+    return OpenCLVersion::CL_VER_1_0;
+  }
+  if (words[1] == "2.0") {
+    return OpenCLVersion::CL_VER_2_0;
+  } else if (words[1] == "1.2") {
+    return OpenCLVersion::CL_VER_1_2;
+  } else if (words[1] == "1.1") {
+    return OpenCLVersion::CL_VER_1_1;
+  } else if (words[1] == "1.0") {
+    return OpenCLVersion::CL_VER_1_0;
+  } else {
+    LOG(FATAL) << "Do not support OpenCL version: " << words[1];
+    return OpenCLVersion::CL_VER_1_0;
+  }
 }
 
 bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 447a2c371c14508f8023453d301a5cbe7abab5c0..31d2932823243d76fc0b15dabee2c4b8ad3af894 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -38,6 +38,13 @@ enum GPUType {
   UNKNOWN,
 };
 
+enum OpenCLVersion {  // results of OpenCLRuntime::ParseDeviceVersion
+  CL_VER_1_0,  // device version field "1.0"
+  CL_VER_1_1,  // device version field "1.1"
+  CL_VER_1_2,  // device version field "1.2"
+  CL_VER_2_0,  // device version field "2.0"
+};
+
 
 const std::string OpenCLErrorToString(cl_int error);
 
@@ -113,7 +120,7 @@ class OpenCLRuntime {
       const std::string &built_program_key,
       const std::string &build_options_str,
       cl::Program *program);
-  const std::string ParseDeviceVersion(const std::string &device_version);
+  OpenCLVersion ParseDeviceVersion(const std::string &device_version);
 
  private:
   std::unique_ptr<KVStorage> precompiled_binary_storage_;
@@ -127,7 +134,7 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string platform_info_;
-  std::string opencl_version_;
+  OpenCLVersion opencl_version_;
   std::string precompiled_binary_platform_info_;
   std::string cached_binary_platform_info_;
   bool out_of_range_check_;
diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc
index 76f246df010fba06d44fd32a8a4f7eb90b688918..2769e08b90c2a665ec4236103cdba8dd04c6fac4 100644
--- a/mace/kernels/opencl/batch_norm.cc
+++ b/mace/kernels/opencl/batch_norm.cc
@@ -118,10 +118,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(
     input_shape_ = input->shape();
   }
 
-  std::vector<uint32_t> lws(4, 0);
-  lws[1] = std::min<uint32_t>(gws[1], kwg_size_);
-  lws[0] = std::min<uint32_t>(4, kwg_size_ / lws[1]);
-  lws[2] = std::min<uint32_t>(gws[2], kwg_size_ / (lws[1] * lws[0]));
+  const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 47e09450a3654b1ca4bac2c825b1017fd5fc7d45..4ccb42a167715f4a20c94095da5ca256fccf1bdc 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -25,11 +25,11 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
   const uint32_t lws_size = lws[0] * lws[1];
-  lws[2] = std::min<uint32_t>(base, kwg_size / lws_size);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size), 1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc
index d7706cb588aa5c3dac9cfd4fb5ff29eb714a8eca..6221382e7e8e9d9290777379d7d77832b17b8e40 100644
--- a/mace/kernels/opencl/conv_2d.cc
+++ b/mace/kernels/opencl/conv_2d.cc
@@ -80,8 +80,8 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
       std::vector<index_t> *input_shape, Tensor *output, StatsFuture *future,
       uint32_t *kwg_size, std::unique_ptr<BufferBase> *kernel_error);
   // Selection matrix: kernel_size x stride_size
-  static const Conv2dOpenclFunction selector[5] = {
-      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr};
+  static const Conv2dOpenclFunction selector[3] = {
+      Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3};
 
   index_t kernel_h = filter->dim(2);
   index_t kernel_w = filter->dim(3);
@@ -113,7 +113,7 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
                   &output_image_shape);
   MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape));
 
-  if (kernel_h == kernel_w && kernel_h <= 5 &&
+  if (kernel_h == kernel_w && kernel_h <= 3 &&
       selector[kernel_h - 1] != nullptr) {
     auto conv2d_func = selector[kernel_h - 1];
     return conv2d_func(
diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc
index 3f9596dfe8db68c573c50c40ab1c23e4570a7188..48ea04d3d35f792a984eaa2cef01ca2b00726e3e 100644
--- a/mace/kernels/opencl/conv_2d_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_1x1.cc
@@ -29,7 +29,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  const uint32_t base =
+      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (lws[1] >= base) {
     lws[0] = std::min<uint32_t>(gws[0], base);
@@ -48,7 +49,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   if (lws[2] == 0) {
     lws[2] = std::min<uint32_t>(gws[2], base);
   }
-  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                              1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc
index 0da37fc1573eb0b4c3be0f7f09b2dde1ba6db27e..0fa295d0746d8903741418d3cc4177a8f5c91da2 100644
--- a/mace/kernels/opencl/conv_2d_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_3x3.cc
@@ -30,7 +30,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   uint32_t compute_units = std::max<uint32_t>(
       OpenCLRuntime::Global()->device_compute_units() / 2, 1);
   const uint32_t base =
-      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
+      std::max<uint32_t>(
+          std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[0] =
       std::min<uint32_t>(std::min<uint32_t>(gws[0], base), kwg_size / lws[1]);
@@ -42,7 +43,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   if (lws[2] == 0) {
     lws[2] = std::min<uint32_t>(gws[2], base);
   }
-  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                              1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc
index 127df7e9c45af64fd2a316bd8880eaa1a131ff27..c3208eca5f45ad7fd8fdc255a0e59906b531e228 100644
--- a/mace/kernels/opencl/conv_2d_general.cc
+++ b/mace/kernels/opencl/conv_2d_general.cc
@@ -32,7 +32,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
   uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  const uint32_t base =
+      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[0] = gws[0] / 4;
   if (lws[0] == 0) {
@@ -51,7 +52,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
       lws[2] = base;
     }
   }
-  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                              1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc
index 486bf064e592fa4b4caf8fa5f1fb51eb50cee32f..a6d65b9e3255a15235dd9421b9194934740d93bb 100644
--- a/mace/kernels/opencl/deconv_2d_opencl.cc
+++ b/mace/kernels/opencl/deconv_2d_opencl.cc
@@ -144,7 +144,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel,
     *prev_input_shape = input->shape();
   }
 
-  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
+  const std::vector<uint32_t> lws = Default3DLocalWS(gws, *kwg_size);
   std::string tuning_key =
       Concat("deconv2d_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc
index 7ca2d5d708ac6211e1136b7fdd46ba516f3251a3..ec358d092ef1da623190b55ff9b8da04a03bb1c4 100644
--- a/mace/kernels/opencl/depthwise_conv.cc
+++ b/mace/kernels/opencl/depthwise_conv.cc
@@ -27,24 +27,26 @@ const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
+  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
-  if (lws[1] >= min_lws0) {
-    lws[0] = std::min<uint32_t>(gws[0], min_lws0);
+  if (lws[1] >= base) {
+    lws[0] = std::min<uint32_t>(gws[0], base);
   } else {
     lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
-    if (lws[0] < min_lws0) {
-      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, min_lws0),
+    if (lws[0] < base) {
+      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, base),
                                   kwg_size / lws[1]);
     }
   }
+  lws[0] = std::max<uint32_t>(lws[0], 1);  // keep lws[0] >= 1 (divisor below)
   const uint32_t lws_size = lws[0] * lws[1];
   lws[2] = std::min<uint32_t>((cache_size / kernel_cache_size / lws_size) * 4,
                               gws[2]);
   if (lws[2] == 0) {
     lws[2] = gws[2];
   }
-  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                              1);  // keep lws[2] >= 1
   return lws;
 }
 
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index b5254800fa9358fa95e81cb9328cfcb4f01fedba..78e50871ff77d19a7c287619c75499afd00047d7 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -252,7 +252,8 @@ std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
   lws[2] =
       std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
   const uint32_t lws_size = lws[1] * lws[2];
-  lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
+  lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size / lws_size),
+                              1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc
index b408a9cdf477b416cb15b79e7619d0158ebe8440..09bc8d905f835cd48345856d434ce974638c0e29 100644
--- a/mace/kernels/opencl/pooling.cc
+++ b/mace/kernels/opencl/pooling.cc
@@ -26,7 +26,7 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   lws[2] =
       std::min<uint32_t>(std::min<uint32_t>(gws[2], base), kwg_size / lws[1]);
@@ -35,7 +35,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   if (lws[0] == 0) {
     lws[0] = gws[0];
   }
-  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws_size);
+  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws_size),
+                              1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc
index e935e9059130cd513a90543d548e24be467a83ae..ee823116877939eb7df7ba142cc9e0c763c3f289 100644
--- a/mace/kernels/opencl/resize_bilinear.cc
+++ b/mace/kernels/opencl/resize_bilinear.cc
@@ -26,7 +26,7 @@ namespace {
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   std::vector<uint32_t> lws(4, 0);
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (lws[1] >= base) {
     lws[0] = std::min<uint32_t>(gws[0], base);
@@ -42,7 +42,8 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   if (lws[2] == 0) {
     lws[2] = gws[2];
   }
-  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size),
+                              1);
   return lws;
 }
 
diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc
index 76dc2c7f810aadf9042c5a5c6ce07875fdf799f3..38d0b8bb3eb205b73a0f66e6fe0be2fad9134bcc 100644
--- a/mace/kernels/opencl/softmax.cc
+++ b/mace/kernels/opencl/softmax.cc
@@ -26,7 +26,7 @@ namespace {
 
 std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
   uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
-  uint32_t base = cache_size / kBaseGPUMemCacheSize;
+  uint32_t base = std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
   std::vector<uint32_t> lws(4, 0);
   lws[1] = std::min<uint32_t>(gws[1], kwg_size);
   if (gws[0] < base) {
@@ -35,7 +35,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
     lws[0] = gws[0] / base;
   }
   lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
-  lws[2] = std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1]));
+  lws[2] = std::max<uint32_t>(std::min<uint32_t>(gws[2],
+                                                 kwg_size / (lws[0] * lws[1])),
+                              1);
   return lws;
 }
 
diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc
index 94b28b0b1b2e74b806b853e2a015a7bd54b4cade..6b2325b725177ff203deecda7b5e55bc74d47b6a 100644
--- a/mace/ops/buffer_to_image_test.cc
+++ b/mace/ops/buffer_to_image_test.cc
@@ -136,7 +136,7 @@ TEST(BufferToImageTest, WeightWidthMedium) {
 
 TEST(BufferToImageTest, WeightWidthLarge) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_WIDTH,
-                                                   {64, 128, 11, 13});
+                                                   {64, 64, 11, 13});
 }
 
 TEST(BufferToImageTest, WeightHeightSmall) {
@@ -151,7 +151,7 @@ TEST(BufferToImageTest, WeightHeightMedium) {
 
 TEST(BufferToImageTest, WeightHeightLarge) {
   TestBidirectionTransform<DeviceType::GPU, float>(kernels::WEIGHT_HEIGHT,
-                                                   {64, 32, 11, 13});
+                                                   {64, 16, 11, 13});
 }
 
 namespace {