diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 3477664fbd5952ecf486dc9fa7ce015170b53bd2..f9b0d5e2e99ce00d8a4e961ab7b9c24c56b458ed 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime(): } } + device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, + &device_global_mem_cache_size_); + + device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, + &device_compute_units_); const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK"); if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1 && out_of_range_check[0] == '1') { @@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } +const uint64_t OpenCLRuntime::device_global_mem_cache_size() const { + return device_global_mem_cache_size_; +} + +const uint32_t OpenCLRuntime::device_compute_units() const { + return device_compute_units_; +} + bool OpenCLRuntime::BuildProgramFromBinary( const std::string &built_program_key, const std::string &build_options_str, diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 8228324c04ea0a000dadeb152b9bb6b49b1427e9..521698b7fc33d4b1ee7fa8f3e19265e511870081 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -73,6 +73,8 @@ class OpenCLRuntime { cl::CommandQueue &command_queue(); const GPUType gpu_type() const; const std::string platform_info() const; + const uint64_t device_global_mem_cache_size() const; + const uint32_t device_compute_units() const; cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, @@ -128,6 +130,9 @@ class OpenCLRuntime { bool program_map_changed_; std::unique_ptr storage_; bool is_profiling_enabled_; + uint64_t device_global_mem_cache_size_; + uint32_t 
device_compute_units_; + static GPUPerfHint kGPUPerfHint; static GPUPriorityHint kGPUPriorityHint; diff --git a/mace/kernels/opencl/REAEMD.md b/mace/kernels/opencl/REAEMD.md deleted file mode 100644 index c6f42fd5c04d53ac3da5476d10198aaa17244d6d..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/REAEMD.md +++ /dev/null @@ -1,58 +0,0 @@ -OpenCL Image Storage Layout -=== -Use **Image** object to optimize memory access and parallel computing based on OpenCL 2.0. - - -Design the corresponding **Image** format to optimize memory access for different Op algorithm. -Each pixel of **Image** object contains 4 elements(e.g. RGBA). - - -The Followings are the **Buffer** and **Image** format for all **Tensors**. - -Input/Output ---- -**Mace** use NHWC format Input/Output. - -| Tensor| Buffer| Image Size [Width, Height]| Explanation| -| --------- | :---------:|:--------:|:----:| -|Channel-Major Input/Output | NHWC | [W * (C+3)/4, N * H] | Default Input/Output format| -|Height-Major Input/Output | NHWC | [W * C, N * (H+3)/4] | Winograd Convolution format| -|Width-Major Input/Output | NHWC | [(W+3)/4 * C, N * H] | Winograd Convolution format| - -Each Pixel of **Image** contains 4 elements. The below table list the coordination relation -between **Image** and **Buffer**. 
- -| Tensor| Pixel Coordinate Relation| Explanation -| --------- | :---------:| :-----: | -|Channel-Major Input/Output | P[i, j] = {E[n, h, w, c] | (n=j/H, h=j%H, w=i%W, c=[i/W * 4 + k])}| k=[0, 4)| -|Height-Major Input/Output | P[i, j] = {E[n, h, w, c] | (n=j%N, h=[j/H*4 + k], w=i%W, c=i/W)}| k=[0, 4)| -|Width-Major Input/Output | P[i, j] = {E[n, h, w, c] | (n=j/H, h=j%H, w=[i%W*4 + k], c=i/W)}| k=[0, 4)| - - -Filter ---- -| Tensor| Buffer| Image Size [Width, Height]| Explanation| -| --------- | :---------:|:--------:|:----:| -|Convolution Filter | HWOI | [H * W * RoundUp<4>(I), (O+3)/4]|Convolution filter format,There is no difference compared to [H*w*I, (O+3)/4]| -|Depthwise Convlution Filter | HWIM | [H * W * M, (I+3)/4]|Depthwise-Convolution filter format| - -Each Pixel of **Image** contains 4 elements. The below table list the coordination relation -between **Image** and **Buffer**. - -| Tensor| Pixel Coordinate Relation| Explanation| -| --------- | :---------:| :-----:| -|Convolution Filter | P[m, n] = {E[h, w, o, i] | (h=T/W, w=T%W, o=[n*4+k], i=m%RI)}| RI=((I + 3) / 4) * 4, T=m/RI, k=[0, 4)| -|Depthwise Convlution Filter | P[m, n] = {E[h, w, i, 0] | (h=m/W, w=m%W, i=[n*4+k])}| only support multiplier == 1, k=[0, 4)| - -1-D Argument ---- -| Tensor| Buffer| Image Size [Width, Height]| Explanation| -| --------- | :---------:|:--------:|:----:| -|1-D Argument | W | [(W+3)/4, 1] | 1D argument format, e.g. Bias| - -Each Pixel of **Image** contains 4 elements. The below table list the coordination relation -between **Image** and **Buffer**. 
- -| Tensor| Pixel Coordinate Relation| Explanation| -| --------- | :---------:| :-----:| -|1-D Argument | P[i, 0] = {E[w] | w=i*4+k}| k=[0, 4)| diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation.cc similarity index 93% rename from mace/kernels/opencl/activation_opencl.cc rename to mace/kernels/opencl/activation.cc index 0e29e00f34f93935292304bddc0b91be8297bca8..5cee48620aa0aa6be6600bbbe331016a879c4c54 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation.cc @@ -21,7 +21,6 @@ namespace mace { namespace kernels { - template void ActivationFunctor::operator()(const Tensor *input, const Tensor *alpha, @@ -56,23 +55,23 @@ void ActivationFunctor::operator()(const Tensor *input, } switch (activation_) { case RELU: - tuning_key_prefix_ = "relu_opencl_kernel_"; + tuning_key_prefix_ = "relu_opencl_kernel"; built_options.emplace("-DUSE_RELU"); break; case RELUX: - tuning_key_prefix_ = "relux_opencl_kernel_"; + tuning_key_prefix_ = "relux_opencl_kernel"; built_options.emplace("-DUSE_RELUX"); break; case PRELU: - tuning_key_prefix_ = "prelu_opencl_kernel_"; + tuning_key_prefix_ = "prelu_opencl_kernel"; built_options.emplace("-DUSE_PRELU"); break; case TANH: - tuning_key_prefix_ = "tanh_opencl_kernel_"; + tuning_key_prefix_ = "tanh_opencl_kernel"; built_options.emplace("-DUSE_TANH"); break; case SIGMOID: - tuning_key_prefix_ = "sigmoid_opencl_kernel_"; + tuning_key_prefix_ = "sigmoid_opencl_kernel"; built_options.emplace("-DUSE_SIGMOID"); break; default: @@ -110,7 +109,7 @@ void ActivationFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 
b4e2493f7876cc8b2d12dae66a5c70be3606ebb4..105435d5a48e912bf2c147d628d9f12581ebeea1 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -106,10 +106,10 @@ void AddNFunctor::operator()( } const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::stringstream ss; - ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] - << "_" << output_shape[2] << "_" << output_shape[3]; - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), + output_tensor->dim(2), output_tensor->dim(3)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm.cc similarity index 94% rename from mace/kernels/opencl/batch_norm_opencl.cc rename to mace/kernels/opencl/batch_norm.cc index 21adfd9626f5e60f9962fc6576299a007ffc2bad..f28c9ccc6cda25ec713c108bc1eae2ad3f9a38ed 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -116,9 +116,12 @@ void BatchNormFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; + std::vector lws(4, 0); + lws[1] = std::min(gws[1], kwg_size_); + lws[0] = std::min(4, kwg_size_ / lws[1]); + lws[2] = std::min(gws[2], kwg_size_ / (lws[1] * lws[0])); std::string tuning_key = - Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), + Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add.cc similarity index 98% rename from mace/kernels/opencl/bias_add_opencl.cc rename to mace/kernels/opencl/bias_add.cc index 
5cffe75caf94f61af804ab71f6381233df6012e1..b6d2b4b1855d2210cb25fb6b99921800d18a6cba 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -79,7 +79,7 @@ void BiasAddFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); cl::Event event; cl_int error; diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index b30ecb69b60cb0a12dadf63c245e606227d63ca5..7cb082544f55c2bf72711ec0fe6ec0e8448442eb 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -90,14 +90,11 @@ void ChannelShuffleFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "channel_shuffle_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("channel_shuffle_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 4dacf8cfa8bf8fba02c669e889f990831c1fc9ef..514da7840437f6ea7e3f8740d8cc2923279230e8 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -21,6 +21,23 @@ namespace mace { namespace kernels { +namespace { +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[0] = 
std::min(base, kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min(base, kwg_size / lws_size); + return lws; +} + +} // namespace + + static void Concat2(cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, @@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel, *prev_input_shape = input0->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; - std::stringstream ss; - ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); + const std::vector lws = LocalWS(gws, *kwg_size); + std::string tuning_key = + Concat("concat_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); @@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel, index_t chan_blk_offset = 0; cl::Event event; CallStats call_stats{INT64_MAX, 0}; - const std::vector lws = {8, *kwg_size / 64, 8, 1}; for (int i = 0; i < inputs_count; ++i) { const Tensor *input = input_list[i]; index_t input_channel_blk = input->dim(3) / 4; @@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel, static_cast(input_channel_blk), static_cast(width), static_cast(batch * height), }; + const std::vector lws = LocalWS(gws, *kwg_size); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -183,6 +200,6 @@ static void ConcatN(cl::Kernel *kernel, for (size_t j = 0; j < 3; ++j) { roundup_gws[j] = RoundUp(gws[j], lws[j]); } error = runtime->command_queue().enqueueNDRangeKernel( *kernel, cl::NullRange, diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d.cc similarity index 100% rename from mace/kernels/opencl/conv_2d_opencl.cc rename to mace/kernels/opencl/conv_2d.cc diff --git 
a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc similarity index 81% rename from mace/kernels/opencl/conv_2d_opencl_1x1.cc rename to mace/kernels/opencl/conv_2d_1x1.cc index d148edb2bbdefa587f10ac28e49ba6c8c95525b2..edce5d11715686ccb90021aa34a4bd7f1858fdcd 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -20,6 +20,39 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] < base) { + lws[0] = std::max(gws[0] / 4, base); + } + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / lws_size / compute_units) * 8, + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -130,9 +163,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; + std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = - Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), + Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git 
a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc similarity index 82% rename from mace/kernels/opencl/conv_2d_opencl_3x3.cc rename to mace/kernels/opencl/conv_2d_3x3.cc index a51ff2527221509ce197209e2a8b5d2898f39077..7dcd320d43bd793e4eb9606976f6a377dd788302 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -22,6 +22,35 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = std::max( + OpenCLRuntime::Global()->device_compute_units() / 2, 1); + const uint32_t base = std::min(cache_size / kBaseGPUMemCacheSize, + 4); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(std::min(gws[0], base), + kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + RoundUp(cache_size / kernel_cache_size / + lws_size / compute_units, base), + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -128,9 +157,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {4, *kwg_size / 32, 8, 0}; + const std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = - Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), + Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_general.cc similarity 
index 79% rename from mace/kernels/opencl/conv_2d_opencl_general.cc rename to mace/kernels/opencl/conv_2d_general.cc index b8431193358909deb7fa435b1e72e3a3f843c30c..63b64dbdfa50d5f82a2d73b1191740cf99606b9f 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -21,6 +21,42 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; +// TODO(liuqi): Fix the specific value. +const uint32_t lws_limit = 20; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kernel_size, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / kernel_size / lws_size / compute_units) + * 8, + gws[2]); + if (lws[2] == 0) { + if (gws[2] < lws_limit) { + lws[2] = gws[2]; + } else { + lws[2] = base; + } + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace extern void Conv2dOpencl(cl::Kernel *kernel, const Tensor *input, @@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; std::string tuning_key = - Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); + Concat("conv2d_general_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3), + filter->dim(0), filter->dim(1)); + std::vector lws = + LocalWS(gws, filter->dim(0) * filter->dim(1), 
*kwg_size); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space.cc similarity index 92% rename from mace/kernels/opencl/depth_to_space_opencl.cc rename to mace/kernels/opencl/depth_to_space.cc index f5f45ca94a74f91471060947e9717182e188b44a..fd25f948c355999909bcd670e41ff249dc4e5aea 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor::operator()( const char *kernel_name = nullptr; uint32_t gws[3]; - std::stringstream ss; + std::string tuning_key; index_t output_height, output_width, output_depth; if (d2s_) { output_height = input_height * block_size_; @@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor::operator()( gws[0] = static_cast(RoundUpDiv4(output_depth)); gws[1] = static_cast(output_width); gws[2] = static_cast(output_height * batch); - ss << "depth_to_space_opencl_kernel_" << batch << "_" - << output_height << "_" << output_width << "_" << output_depth; + tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height, + output_width, output_depth); } else { output_height = input_height / block_size_; output_width = input_width / block_size_; @@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor::operator()( gws[0] = static_cast(RoundUpDiv4(input_depth)); gws[1] = static_cast(input_width); gws[2] = static_cast(input_height * batch); - ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" - << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); + tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), + input->dim(1), input->dim(2), input->dim(3)); } const index_t input_depth_blocks = RoundUpDiv4(input_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth); @@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, 
kwg_size_ / 64, 8, 0}; - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv.cc similarity index 87% rename from mace/kernels/opencl/depthwise_conv_opencl.cc rename to mace/kernels/opencl/depthwise_conv.cc index 67bfbf7a6c051c156b1db4b0e9114fcc09cdb1ce..d4aa32f3b715baa8b1118890e201a9309c32174e 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -21,6 +21,37 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= min_lws0) { + lws[0] = std::min(gws[0], min_lws0); + } else { + lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); + if (lws[0] < min_lws0) { + lws[0] = std::min(std::max(gws[0] / 4, min_lws0), + kwg_size / lws[1]); + } + } + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / lws_size) * 4, + gws[2]); + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + static void DepthwiseConv2d(cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM @@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; - std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, - 
batch, height, width, channels, multiplier); + const std::vector lws = LocalWS(gws, *kwg_size); + std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", + gws[0], gws[1], gws[2], multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise.cc similarity index 94% rename from mace/kernels/opencl/eltwise_opencl.cc rename to mace/kernels/opencl/eltwise.cc index d834c292c51697adacb36cb849cbb2b50b6085fc..e3f4b8f8f7db189fc1faf8d52140cd259768af9f 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -116,11 +116,11 @@ void EltwiseFunctor::operator()(const Tensor *input0, input_shape_ = input0->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("eltwise_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected.cc similarity index 98% rename from mace/kernels/opencl/fully_connected_opencl.cc rename to mace/kernels/opencl/fully_connected.cc index 378a9d835436f7f6cd8932935dec8f58d3d4abdc..e1546541a71d056d61bbaa2205a1fdd7ee55ec94 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - std::stringstream ss; - ss << "fc_opencl_kernel_" << 
output->dim(0) << "_" << output->dim(1) << "_" - << output->dim(2) << "_" << output->dim(3); - TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future); + std::string tuning_key = + Concat("fc_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 7934b7209f0456edda559044b041f482b8554472..7f3e444e832a9cbe74f00610ca44ac44158de45c 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -206,6 +206,32 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { } } +std::vector Default2DLocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(3, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[0] = std::min(base, kwg_size); + lws[1] = std::min(gws[1], kwg_size / lws[0]); + return lws; + +} + +std::vector Default3DLocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[2] = std::min(std::min(gws[2], base), + kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = std::min(base, kwg_size / lws_size); + return lws; +} + void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, @@ -216,31 +242,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); - std::vector local_ws(3, 0); - local_ws[0] = std::min(gws[0], kwg_size); - local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); - local_ws[2] =
std::min(gws[2], kwg_size / (local_ws[0] * local_ws[1])); - return { + std::vector> results; + std::vector> candidates = { // TODO(heliangliang): tuning these magic numbers - {local_ws[0], local_ws[1], local_ws[2], 0}, - {kwg_size / 16, 4, 4, 0}, - {kwg_size / 32, 4, 8, 0}, - {kwg_size / 32, 8, 4, 0}, - {kwg_size / 64, 8, 8, 0}, - {kwg_size / 64, 16, 4, 0}, - {kwg_size / 128, 8, 16, 0}, - {kwg_size / 128, 16, 8, 0}, - {kwg_size / 128, 32, 4, 0}, - {1, kwg_size / 32, 32, 0}, - {1, kwg_size / 64, 64, 0}, - {1, kwg_size / 128, 128, 0}, - {4, kwg_size / 16, 4, 0}, - {4, kwg_size / 28, 7, 0}, - {4, kwg_size / 32, 8, 0}, - {4, kwg_size / 56, 14, 0}, - {1, kwg_size, 1, 0}, + {gws[0], gws[1], gws[2], 0}, + {gws[0], gws[1], gws[2] / 8, 0}, + {gws[0], gws[1], gws[2] / 4, 0}, + {gws[0], gws[1], 8, 0}, + {gws[0], gws[1], 4, 0}, + {gws[0], gws[1], 1, 0}, + {gws[0] / 4, gws[1], gws[2], 0}, + {gws[0] / 4, gws[1], gws[2] / 8, 0}, + {gws[0] / 4, gws[1], gws[2] / 4, 0}, + {gws[0] / 4, gws[1], 8, 0}, + {gws[0] / 4, gws[1], 4, 0}, + {gws[0] / 4, gws[1], 1, 0}, + {gws[0] / 8, gws[1], gws[2], 0}, + {gws[0] / 8, gws[1], gws[2] / 8, 0}, + {gws[0] / 8, gws[1], gws[2] / 4, 0}, + {gws[0] / 8, gws[1], 8, 0}, + {gws[0] / 8, gws[1], 4, 0}, + {gws[0] / 8, gws[1], 1, 0}, + {4, gws[1], gws[2], 0}, + {4, gws[1], gws[2] / 8, 0}, + {4, gws[1], gws[2] / 4, 0}, + {4, gws[1], 8, 0}, + {4, gws[1], 4, 0}, + {4, gws[1], 1, 0}, + {1, gws[1], gws[2], 0}, + {1, gws[1], gws[2] / 8, 0}, + {1, gws[1], gws[2] / 4, 0}, + {1, gws[1], 8, 0}, + {1, gws[1], 4, 0}, + {1, gws[1], 1, 0}, }; + for (auto &ele : candidates) { + const uint32_t tmp = ele[0] * ele[1] * ele[2]; + if (0 < tmp && tmp <= kwg_size) { + results.push_back(ele); + } + } + return results; }; cl::Event event; auto func = [&](const std::vector ¶ms, Timer *timer, @@ -333,19 +375,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = 
static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); - uint32_t local_ws[2]; - local_ws[0] = std::min(gws[0], kwg_size); - local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); - return {{local_ws[0], local_ws[1], 0}, - {local_ws[1], local_ws[0], 0}, - {kwg_size / 4, 4, 0}, - {kwg_size / 16, 16, 0}, - {kwg_size / 32, 32, 0}, - {kwg_size / 64, 64, 0}, - {kwg_size / 128, 128, 0}, - {kwg_size / 256, 256, 0}, - {kwg_size, 1, 0}, - {1, kwg_size, 0}}; + std::vector> results; + std::vector> candidates = { + {kwg_size / 2, 2, 0}, + {kwg_size / 4, 4, 0}, + {kwg_size / 8, 8, 0}, + {kwg_size / 16, 16, 0}, + {kwg_size / 32, 32, 0}, + {kwg_size / 64, 64, 0}, + {kwg_size / 128, 128, 0}, + {kwg_size / 256, 256, 0}, + {kwg_size, 1, 0}, + {1, kwg_size, 0} + }; + for (auto &ele : candidates) { + const uint32_t tmp = ele[0] * ele[1] * ele[2]; + if (0 < tmp && tmp <= kwg_size) { + results.push_back(ele); + } + } + return results; }; cl::Event event; auto func = [&](const std::vector ¶ms, Timer *timer, @@ -426,5 +475,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } } + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5257ed2cabfc72e59a80f2b1e6af8a00df03cb8c..4576ba99e41156a03def60bac6d9eccb1e7ac69a 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -29,6 +29,8 @@ namespace kernels { const float kMaxKernelExeTime = 1000.0; // microseconds +const int32_t kBaseGPUMemCacheSize = 16384; + enum BufferType { CONV2D_FILTER = 0, IN_OUT_CHANNEL = 1, @@ -112,6 +114,11 @@ std::string Concat(Args... 
args) { return ss.str(); } +std::vector Default2DLocalWS(const uint32_t *gws, + const uint32_t kwg_size); +std::vector Default3DLocalWS(const uint32_t *gws, + const uint32_t kwg_size); + } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index b307c44572932bbcdbb5abee14bca75714abc36b..98529547d172613bdc33c5313edc88974efeafb4 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -85,10 +85,10 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); const std::vector lws = {kwg_size_ / 64, 64, 0}; - std::stringstream ss; - ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" - << C->dim(2) << "_" << C->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("matmul_opencl_kernel", C->dim(0), + C->dim(1), C->dim(2), C->dim(3)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 46eb496832c1536c2c0d8ee3ef645062ad3a405e..c3c90944aef80f6e971c8bbfa76045381d399786 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -100,7 +100,7 @@ void PadFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling.cc similarity index 85% rename from mace/kernels/opencl/pooling_opencl.cc rename to mace/kernels/opencl/pooling.cc index 5d31b76f325111f289f0aa88fe286ccc93357a36..7d7fe3d86e48c44dd3a2ffc34460a055d2fd2925 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ 
b/mace/kernels/opencl/pooling.cc @@ -21,6 +21,28 @@ namespace mace { namespace kernels { +namespace { + +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[2] = std::min(std::min(gws[2], base), + kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::min(lws[0], kwg_size / lws_size); + return lws; +} + +} // namespace + template void PoolingFunctor::operator()(const Tensor *input, Tensor *output, @@ -134,11 +156,11 @@ void PoolingFunctor::operator()(const Tensor *input, }; } - std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); + const std::vector lws = LocalWS(gws.data(), kwg_size_); + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear.cc similarity index 80% rename from mace/kernels/opencl/resize_bilinear_opencl.cc rename to mace/kernels/opencl/resize_bilinear.cc index 1b154bb1adb97657f8e625a5fe839fbc17347550..45f3b2e2a7ac14a2d4a2f1f66c2f12660a463fe0 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -22,6 +22,34 @@ namespace mace { namespace kernels { +namespace { +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t 
cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = gws[2] / 8; + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + template void ResizeBilinearFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { @@ -99,11 +127,11 @@ void ResizeBilinearFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" - << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = LocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("resize_bilinear_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 29b5f909b28e1504f6a9c825c3e50e1a3b44e676..b7f6086814fb2f63a9a24ccbde5baca0ba904f56 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -72,7 +72,7 @@ void SliceFunctor::operator()( static_cast(input->dim(0) * input->dim(1)), }; - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); cl::Event event; CallStats call_stats{INT64_MAX, 0}; for (int i = 0; i < outputs_count; ++i) { diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax.cc similarity index 81% 
rename from mace/kernels/opencl/softmax_opencl.cc rename to mace/kernels/opencl/softmax.cc index 47c10dca9fa70e3620dd6ae52e24aff6204c806c..85ba41f274fe97f81e3573aa7c2d4a90df0539f6 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax.cc @@ -22,6 +22,27 @@ namespace mace { namespace kernels { +namespace { + +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + std::vector lws(4, 0); + lws[1] = std::min(gws[1], kwg_size); + if (gws[0] < base) { + lws[0] = gws[0]; + } else { + lws[0] = gws[0] / base; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + lws[2] = std::min(gws[2], kwg_size / (lws[0] * lws[1])); + return lws; +} + +} // namespace + template void SoftmaxFunctor::operator()(const Tensor *logits, Tensor *output, @@ -81,11 +102,11 @@ void SoftmaxFunctor::operator()(const Tensor *logits, input_shape_ = logits->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + std::vector lws = LocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("softmax_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch.cc similarity index 93% rename from mace/kernels/opencl/space_to_batch_opencl.cc rename to mace/kernels/opencl/space_to_batch.cc index 454d2d0a0d0e148618262ebb710d3a9712ec2ca2..02f76bea173e9275db4d35b05152af53518cd668 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ 
b/mace/kernels/opencl/space_to_batch.cc @@ -105,12 +105,11 @@ void SpaceToBatchFunctor::operator()( space_shape_ = space_tensor->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << kernel_name << "_" << batch_tensor->dim(0) << "_" - << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" - << batch_tensor->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + std::string tuning_key = + Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 62a7ca601f5a29b5387f90938281c3cb73128235..497cd1000479ff1c855cd3c89199fbab8cb96ced 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -102,11 +102,11 @@ void WinogradTransformFunctor::operator()( } const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::stringstream ss; - ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" - << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" - << input_tensor->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("winograd_transform_kernel", output_tensor->dim(0), + output_tensor->dim(1), output_tensor->dim(2), + output_tensor->dim(3)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); @@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor::operator()( } const std::vector lws = {kwg_size_ / 8, 8, 0}; - - std::stringstream ss; - ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" - << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" - << 
input_tensor->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), + output_tensor->dim(1), output_tensor->dim(2), + output_tensor->dim(3), input_tensor->dim(2)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr);