提交 eb1e5131 编写于 作者: L Liangliang He

Merge branch 'opencl-lws' into 'master'

Refactor opencl default local work group size.

See merge request !449
......@@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime():
}
}
device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
&device_gloabl_mem_cache_size_);
device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS,
&device_compute_units_);
const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK");
if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1
&& out_of_range_check[0] == '1') {
......@@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; }
// Returns the device's global memory cache size in bytes, as queried via
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE in the OpenCLRuntime constructor.
// Used by the kernels to derive default local work-group sizes.
// NOTE(review): the member name contains a typo ("gloabl"); renaming it
// would also require changing the header declaration — confirm before fixing.
const uint64_t OpenCLRuntime::device_global_mem_cache_size() const {
  return device_gloabl_mem_cache_size_;
}
// Returns the number of parallel compute units on the device, as queried
// via CL_DEVICE_MAX_COMPUTE_UNITS in the OpenCLRuntime constructor.
const uint32_t OpenCLRuntime::device_compute_units() const {
  return device_compute_units_;
}
bool OpenCLRuntime::BuildProgramFromBinary(
const std::string &built_program_key,
const std::string &build_options_str,
......
......@@ -73,6 +73,8 @@ class OpenCLRuntime {
cl::CommandQueue &command_queue();
const GPUType gpu_type() const;
const std::string platform_info() const;
const uint64_t device_global_mem_cache_size() const;
const uint32_t device_compute_units() const;
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
......@@ -128,6 +130,9 @@ class OpenCLRuntime {
bool program_map_changed_;
std::unique_ptr<KVStorage> storage_;
bool is_profiling_enabled_;
uint64_t device_gloabl_mem_cache_size_;
uint32_t device_compute_units_;
static GPUPerfHint kGPUPerfHint;
static GPUPriorityHint kGPUPriorityHint;
......
OpenCL Image Storage Layout
===
Use **Image** object to optimize memory access and parallel computing based on OpenCL 2.0.
Design the corresponding **Image** format to optimize memory access for different Op algorithm.
Each pixel of **Image** object contains 4 elements(e.g. RGBA).
The following are the **Buffer** and **Image** formats for all **Tensors**.
Input/Output
---
**Mace** uses the NHWC format for Input/Output.
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Channel-Major Input/Output | NHWC | [W * (C+3)/4, N * H] | Default Input/Output format|
|Height-Major Input/Output | NHWC | [W * C, N * (H+3)/4] | Winograd Convolution format|
|Width-Major Input/Output | NHWC | [(W+3)/4 * C, N * H] | Winograd Convolution format|
Each Pixel of **Image** contains 4 elements. The table below lists the coordinate relation
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation
| --------- | :---------:| :-----: |
|Channel-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j/H, h=j%H, w=i%W, c=[i/W * 4 + k])}| k=[0, 4)|
|Height-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j%N, h=[j/H*4 + k], w=i%W, c=i/W)}| k=[0, 4)|
|Width-Major Input/Output | P[i, j] = {E[n, h, w, c] &#124; (n=j/H, h=j%H, w=[i%W*4 + k], c=i/W)}| k=[0, 4)|
Filter
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Convolution Filter | HWOI | [H * W * RoundUp<4>(I), (O+3)/4]|Convolution filter format. There is no difference compared to [H * W * I, (O+3)/4]|
|Depthwise Convolution Filter | HWIM | [H * W * M, (I+3)/4]|Depthwise-Convolution filter format|
Each Pixel of **Image** contains 4 elements. The table below lists the coordinate relation
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|Convolution Filter | P[m, n] = {E[h, w, o, i] &#124; (h=T/W, w=T%W, o=[n*4+k], i=m%RI)}| RI=((I + 3) / 4) * 4, T=m/RI, k=[0, 4)|
|Depthwise Convolution Filter | P[m, n] = {E[h, w, i, 0] &#124; (h=m/W, w=m%W, i=[n*4+k])}| only support multiplier == 1, k=[0, 4)|
1-D Argument
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|1-D Argument | W | [(W+3)/4, 1] | 1D argument format, e.g. Bias|
Each Pixel of **Image** contains 4 elements. The table below lists the coordinate relation
between **Image** and **Buffer**.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|1-D Argument | P[i, 0] = {E[w] &#124; w=i*4+k}| k=[0, 4)|
......@@ -21,7 +21,6 @@
namespace mace {
namespace kernels {
template <typename T>
void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const Tensor *alpha,
......@@ -56,23 +55,23 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}
switch (activation_) {
case RELU:
tuning_key_prefix_ = "relu_opencl_kernel_";
tuning_key_prefix_ = "relu_opencl_kernel";
built_options.emplace("-DUSE_RELU");
break;
case RELUX:
tuning_key_prefix_ = "relux_opencl_kernel_";
tuning_key_prefix_ = "relux_opencl_kernel";
built_options.emplace("-DUSE_RELUX");
break;
case PRELU:
tuning_key_prefix_ = "prelu_opencl_kernel_";
tuning_key_prefix_ = "prelu_opencl_kernel";
built_options.emplace("-DUSE_PRELU");
break;
case TANH:
tuning_key_prefix_ = "tanh_opencl_kernel_";
tuning_key_prefix_ = "tanh_opencl_kernel";
built_options.emplace("-DUSE_TANH");
break;
case SIGMOID:
tuning_key_prefix_ = "sigmoid_opencl_kernel_";
tuning_key_prefix_ = "sigmoid_opencl_kernel";
built_options.emplace("-DUSE_SIGMOID");
break;
default:
......@@ -110,7 +109,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
......
......@@ -106,10 +106,10 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 16, 16, 0};
std::stringstream ss;
ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1]
<< "_" << output_shape[2] << "_" << output_shape[3];
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1),
output_tensor->dim(2), output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -116,9 +116,12 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::vector<uint32_t> lws(4, 0);
lws[1] = std::min<uint32_t>(gws[1], kwg_size_);
lws[0] = std::min<uint32_t>(4, kwg_size_ / lws[1]);
lws[2] = std::min<uint32_t>(gws[2], kwg_size_ / (lws[1] * lws[0]));
std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
Concat("batch_norm_opencl_kernel", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
......
......@@ -79,7 +79,7 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event;
cl_int error;
......
......@@ -90,14 +90,11 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "channel_shuffle_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("channel_shuffle_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -21,6 +21,23 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic default local work-group size for the concat kernels.
// Fixes lws[1] to the full second global dimension (capped by the kernel's
// max work-group size), then splits the remaining work-group budget across
// the other two axes, each capped by a cache-derived "base" factor.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
  lws[2] = std::min<uint32_t>(base, kwg_size / (lws[0] * lws[1]));
  return lws;
}
} // namespace
static void Concat2(cl::Kernel *kernel,
const Tensor *input0,
const Tensor *input1,
......@@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel,
*prev_input_shape = input0->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::stringstream ss;
ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("concat_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......@@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel,
index_t chan_blk_offset = 0;
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
for (int i = 0; i < inputs_count; ++i) {
const Tensor *input = input_list[i];
index_t input_channel_blk = input->dim(3) / 4;
......@@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel,
static_cast<uint32_t>(input_channel_blk), static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
uint32_t idx = 0;
if (runtime->IsOutOfRangeCheckEnabled()) {
......@@ -183,6 +200,7 @@ static void ConcatN(cl::Kernel *kernel,
for (size_t j = 0; j < 3; ++j) {
roundup_gws[j] = RoundUp(gws[j], lws[j]);
}
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
error = runtime->command_queue().enqueueNDRangeKernel(
*kernel, cl::NullRange,
......
......@@ -20,6 +20,43 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 128;
// Heuristic local work-group size for the 1x1 convolution kernel.
// Balances the cache-derived "base" factor, the number of compute units,
// and the per-work-item cache footprint (kernel_cache_size) against the
// device's max work-group size (kwg_size).
// gws: 3-element global work size; kwg_size: max work-group size for the
// kernel. Returns a 4-element lws (last element 0 terminates tuning params).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
  // base scales with how much larger the cache is than the reference size.
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) {
    // Wide first dimension: still cap lws[0] by base.
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    // Fall back to a fraction of gws[0], but not below base.
    lws[0] = gws[0] / 8;
    if (lws[0] < base) {
      lws[0] = std::max<uint32_t>(gws[0] / 4, base);
    }
  }
  // Never exceed the work-group budget remaining after lws[1].
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Size the third dimension so one work-group's data roughly fits the
  // cache share of a compute unit (the *8 factor is empirical).
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const Tensor *input,
const Tensor *filter,
......@@ -130,9 +167,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
Concat("conv2d_1x1_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
......@@ -21,6 +21,34 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;
// Heuristic local work-group size for the 3x3 convolution kernel.
// lws[1] takes the full second global dimension (capped by kwg_size);
// lws[0] is capped by both a small cache-derived "base" and the remaining
// work-group budget; lws[2] is sized from the per-unit cache share,
// rounded up to a multiple of base, then clamped to gws[2] and the budget.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Assume roughly half the compute units are usable, but never zero.
  const uint32_t units = std::max<uint32_t>(
      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
  const uint32_t base =
      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4);
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  const uint32_t dim0_budget = kwg_size / lws[1];
  lws[0] = std::min<uint32_t>(std::min<uint32_t>(gws[0], base), dim0_budget);
  const uint32_t plane = lws[0] * lws[1];
  uint32_t depth = std::min<uint32_t>(
      RoundUp<uint32_t>(cache_size / kernel_cache_size / plane / units, base),
      gws[2]);
  if (depth == 0) {
    depth = std::min<uint32_t>(gws[2], base);
  }
  lws[2] = std::min<uint32_t>(depth, kwg_size / plane);
  return lws;
}
} // namespace
extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const Tensor *input,
......@@ -128,9 +156,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {4, *kwg_size / 32, 8, 0};
std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
Concat("conv2d_3x3_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
......
......@@ -21,6 +21,42 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20;
// Heuristic local work-group size for the general convolution kernel.
// Unlike the 1x1/3x3 variants this also scales the cache footprint by the
// filter's spatial size (kernel_size = filter_h * filter_w).
// gws: 3-element global work size; kwg_size: max work-group size.
// Returns a 4-element lws (last element 0 terminates tuning params).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kernel_size,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units();
  // base scales with how much larger the cache is than the reference size.
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  // First dimension: a quarter of the global size, falling back to the
  // whole dimension when it is smaller than 4.
  lws[0] = gws[0] / 4;
  if (lws[0] == 0) {
    lws[0] = gws[0];
  }
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Third dimension sized from the per-unit cache share divided by the
  // filter footprint (the *8 factor is empirical).
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / kernel_size / lws_size / compute_units)
      * 8,
      gws[2]);
  if (lws[2] == 0) {
    // Small depth: take it whole; otherwise fall back to base.
    if (gws[2] < lws_limit) {
      lws[2] = gws[2];
    } else {
      lws[2] = base;
    }
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
extern void Conv2dOpencl(cl::Kernel *kernel,
const Tensor *input,
......@@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
Concat("conv2d_general_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3),
filter->dim(0), filter->dim(1));
std::vector<uint32_t> lws =
LocalWS(gws, filter->dim(0) * filter->dim(1), *kwg_size);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const char *kernel_name = nullptr;
uint32_t gws[3];
std::stringstream ss;
std::string tuning_key;
index_t output_height, output_width, output_depth;
if (d2s_) {
output_height = input_height * block_size_;
......@@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws[0] = static_cast<uint32_t>(RoundUpDiv4(output_depth));
gws[1] = static_cast<uint32_t>(output_width);
gws[2] = static_cast<uint32_t>(output_height * batch);
ss << "depth_to_space_opencl_kernel_" << batch << "_"
<< output_height << "_" << output_width << "_" << output_depth;
tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height,
output_width, output_depth);
} else {
output_height = input_height / block_size_;
output_width = input_width / block_size_;
......@@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws[0] = static_cast<uint32_t>(RoundUpDiv4(input_depth));
gws[1] = static_cast<uint32_t>(input_width);
gws[2] = static_cast<uint32_t>(input_height * batch);
ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_"
<< input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3);
tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0),
input->dim(1), input->dim(2), input->dim(3));
}
const index_t input_depth_blocks = RoundUpDiv4(input_depth);
const index_t output_depth_blocks = RoundUpDiv4(output_depth);
......@@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -21,6 +21,37 @@
namespace mace {
namespace kernels {
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;
// Heuristic local work-group size for the depthwise convolution kernel.
// min_lws0 is the cache-derived lower bound for the first dimension; the
// third dimension is sized from the cache footprint of one work-item
// (kernel_cache_size) with an empirical *4 factor.
// gws: 3-element global work size; kwg_size: max work-group size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size();
  uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= min_lws0) {
    lws[0] = std::min<uint32_t>(gws[0], min_lws0);
  } else {
    // Try an eighth of gws[0]; if still below the cache-derived bound,
    // widen to a quarter (but never past the remaining budget).
    lws[0] = std::min<uint32_t>(gws[0] / 8, kwg_size / lws[1]);
    if (lws[0] < min_lws0) {
      lws[0] = std::min<uint32_t>(std::max<uint32_t>(gws[0] / 4, min_lws0),
                                  kwg_size / lws[1]);
    }
  }
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size) * 4,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
static void DepthwiseConv2d(cl::Kernel *kernel,
const Tensor *input, // NHWC
const Tensor *filter, // HWIM
......@@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 0};
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation,
batch, height, width, channels, multiplier);
const std::vector<uint32_t> lws = LocalWS(gws, *kwg_size);
std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel",
gws[0], gws[1], gws[2], multiplier);
TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
......
......@@ -116,11 +116,11 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_ = input0->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("eltwise_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
char *kerror_code = kernel_error_->mutable_data<char>();
......
......@@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel,
*prev_input_shape = input->shape();
}
std::stringstream ss;
ss << "fc_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_"
<< output->dim(2) << "_" << output->dim(3);
TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future);
std::string tuning_key =
Concat("fc_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
(*kernel_error)->Map(nullptr);
......
......@@ -206,6 +206,31 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
// Default local work-group size for 2D kernels: {lws0, lws1, 0}.
// lws0 is the cache-derived "base" factor capped by the kernel's max
// work-group size; lws1 takes the remaining work-group budget.
// gws is currently unused — kept for signature parity with
// Default3DLocalWS and possible future use.
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(3, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to >= 1 so the division below can never be by zero, even on
  // devices reporting a global memory cache smaller than the reference.
  uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[0] = std::max<uint32_t>(std::min<uint32_t>(base, kwg_size), 1);
  // Bug fix: previously divided by lws[1], which is still 0 here
  // (zero-initialized above) — a guaranteed division by zero. The second
  // dimension must be derived from the already-chosen lws[0].
  lws[1] = kwg_size / lws[0];
  return lws;
}
// Default local work-group size for 3D kernels.
// lws[1] takes the full second global dimension (capped by kwg_size);
// lws[2] is capped by both the cache-derived "base" and the remaining
// budget; lws[0] takes whatever work-group budget is left, capped by base.
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  const uint32_t dim2_budget = kwg_size / lws[1];
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base), dim2_budget);
  lws[0] = std::min<uint32_t>(base, kwg_size / (lws[1] * lws[2]));
  return lws;
}
void TuningOrRun3DKernel(const cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
......@@ -216,31 +241,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] =
std::min<uint32_t>(gws[2], kwg_size / (local_ws[0] * local_ws[1]));
return {
std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = {
// TODO(heliangliang): tuning these magic numbers
{local_ws[0], local_ws[1], local_ws[2], 0},
{kwg_size / 16, 4, 4, 0},
{kwg_size / 32, 4, 8, 0},
{kwg_size / 32, 8, 4, 0},
{kwg_size / 64, 8, 8, 0},
{kwg_size / 64, 16, 4, 0},
{kwg_size / 128, 8, 16, 0},
{kwg_size / 128, 16, 8, 0},
{kwg_size / 128, 32, 4, 0},
{1, kwg_size / 32, 32, 0},
{1, kwg_size / 64, 64, 0},
{1, kwg_size / 128, 128, 0},
{4, kwg_size / 16, 4, 0},
{4, kwg_size / 28, 7, 0},
{4, kwg_size / 32, 8, 0},
{4, kwg_size / 56, 14, 0},
{1, kwg_size, 1, 0},
{gws[0], gws[1], gws[2], 0},
{gws[0], gws[1], gws[2] / 8, 0},
{gws[0], gws[1], gws[2] / 4, 0},
{gws[0], gws[1], 8, 0},
{gws[0], gws[1], 4, 0},
{gws[0], gws[1], 1, 0},
{gws[0] / 4, gws[1], gws[2], 0},
{gws[0] / 4, gws[1], gws[2] / 8, 0},
{gws[0] / 4, gws[1], gws[2] / 4, 0},
{gws[0] / 4, gws[1], 8, 0},
{gws[0] / 4, gws[1], 4, 0},
{gws[0] / 4, gws[1], 1, 0},
{gws[0] / 8, gws[1], gws[2], 0},
{gws[0] / 8, gws[1], gws[2] / 8, 0},
{gws[0] / 8, gws[1], gws[2] / 4, 0},
{gws[0] / 8, gws[1], 8, 0},
{gws[0] / 8, gws[1], 4, 0},
{gws[0] / 8, gws[1], 1, 0},
{4, gws[1], gws[2], 0},
{4, gws[1], gws[2] / 8, 0},
{4, gws[1], gws[2] / 4, 0},
{4, gws[1], 8, 0},
{4, gws[1], 4, 0},
{4, gws[1], 1, 0},
{1, gws[1], gws[2], 0},
{1, gws[1], gws[2] / 8, 0},
{1, gws[1], gws[2] / 4, 0},
{1, gws[1], 8, 0},
{1, gws[1], 4, 0},
{1, gws[1], 1, 0},
};
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
results.push_back(ele);
}
}
return results;
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
......@@ -333,19 +374,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
const uint32_t kwg_size =
static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(kernel));
uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1], 0},
{local_ws[1], local_ws[0], 0},
{kwg_size / 4, 4, 0},
{kwg_size / 16, 16, 0},
{kwg_size / 32, 32, 0},
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}};
std::vector<std::vector<uint32_t>> results;
std::vector<std::vector<uint32_t>> candidates = {
{kwg_size / 2, 2, 0},
{kwg_size / 4, 4, 0},
{kwg_size / 8, 8, 0},
{kwg_size / 16, 16, 0},
{kwg_size / 32, 32, 0},
{kwg_size / 64, 64, 0},
{kwg_size / 128, 128, 0},
{kwg_size / 256, 256, 0},
{kwg_size, 1, 0},
{1, kwg_size, 0}
};
for (auto &ele : candidates) {
const uint32_t tmp = ele[0] * ele[1] * ele[2];
if (0 < tmp && tmp <= kwg_size) {
results.push_back(ele);
}
}
return results;
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params, Timer *timer,
......@@ -426,5 +474,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
}
} // namespace kernels
} // namespace mace
......@@ -29,6 +29,8 @@ namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds
const int32_t kBaseGPUMemCacheSize = 16384;
enum BufferType {
CONV2D_FILTER = 0,
IN_OUT_CHANNEL = 1,
......@@ -112,6 +114,10 @@ std::string Concat(Args... args) {
return ss.str();
}
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
const uint32_t kwg_size);
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
const uint32_t kwg_size);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
......@@ -85,10 +85,10 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(A->dim(2))));
const std::vector<uint32_t> lws = {kwg_size_ / 64, 64, 0};
std::stringstream ss;
ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_"
<< C->dim(2) << "_" << C->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("matmul_opencl_kernel", C->dim(0),
C->dim(1), C->dim(2), C->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("pad", output->dim(0), output->dim(1), output->dim(2),
output->dim(3));
......
......@@ -21,6 +21,28 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic local work-group size for the pooling kernel.
// Chooses lws[1] and lws[2] first (capped by the cache-derived "base" and
// the work-group budget), then gives lws[0] a quarter of gws[0] — or the
// whole dimension when it is smaller than 4 — within the leftover budget.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  uint32_t dim0 = gws[0] / 4;
  if (dim0 == 0) {
    dim0 = gws[0];
  }
  lws[0] = std::min<uint32_t>(dim0, kwg_size / (lws[1] * lws[2]));
  return lws;
}
} // namespace
template <typename T>
void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
Tensor *output,
......@@ -134,11 +156,11 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
};
}
std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future);
const std::vector<uint32_t> lws = LocalWS(gws.data(), kwg_size_);
std::string tuning_key =
Concat("pooling_opencl_kernel_", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -22,6 +22,34 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic local work-group size for the resize-bilinear kernel.
// gws: 3-element global work size; kwg_size: max work-group size.
// Returns a 4-element lws (last element 0 terminates tuning params).
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // base scales with how much larger the cache is than the reference size.
  uint32_t base = cache_size / kBaseGPUMemCacheSize;
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    // Narrow second dimension: take an eighth of gws[0], or the whole
    // dimension when it is smaller than 8.
    lws[0] = gws[0] / 8;
    if (lws[0] == 0) {
      lws[0] = gws[0];
    }
  }
  // Never exceed the work-group budget remaining after lws[1].
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Third dimension: an eighth of gws[2], whole dimension when < 8,
  // clamped to the remaining work-group budget.
  lws[2] = gws[2] / 8;
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
} // namespace
template <typename T>
void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const Tensor *input, Tensor *output, StatsFuture *future) {
......@@ -99,11 +127,11 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_ = input->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
<< output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("resize_bilinear_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -72,7 +72,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
static_cast<uint32_t>(input->dim(0) * input->dim(1)),
};
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
cl::Event event;
CallStats call_stats{INT64_MAX, 0};
for (int i = 0; i < outputs_count; ++i) {
......
......@@ -22,6 +22,27 @@
namespace mace {
namespace kernels {
namespace {
// Heuristic local work-group size for the softmax kernel.
// For small first dimensions (< base) the whole dimension is used;
// otherwise it is split into "base" chunks. Each dimension is then
// clamped so the product never exceeds the kernel's max work-group size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t base = cache_size / kBaseGPUMemCacheSize;
  std::vector<uint32_t> lws(4, 0);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  const uint32_t dim0 = (gws[0] < base) ? gws[0] : (gws[0] / base);
  lws[0] = std::min<uint32_t>(dim0, kwg_size / lws[1]);
  lws[2] = std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1]));
  return lws;
}
} // namespace
template <typename T>
void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
Tensor *output,
......@@ -81,11 +102,11 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_ = logits->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
<< "_" << output->dim(2) << "_" << output->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
std::vector<uint32_t> lws = LocalWS(gws, kwg_size_);
std::string tuning_key =
Concat("softmax_opencl_kernel", output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -105,12 +105,11 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_ = space_tensor->shape();
}
const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 0};
std::stringstream ss;
ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3);
TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
const std::vector<uint32_t> lws = Default3DLocalWS(gws, kwg_size_);
std::string tuning_key =
Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1),
batch_tensor->dim(2), batch_tensor->dim(3));
TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
......@@ -102,11 +102,11 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::stringstream ss;
ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("winograd_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......@@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
}
const std::vector<uint32_t> lws = {kwg_size_ / 8, 8, 0};
std::stringstream ss;
ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_"
<< input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_"
<< input_tensor->dim(3);
TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future);
std::string tuning_key =
Concat("winograd_inverse_transform_kernel", output_tensor->dim(0),
output_tensor->dim(1), output_tensor->dim(2),
output_tensor->dim(3), input_tensor->dim(2));
TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future);
if (runtime->IsOutOfRangeCheckEnabled()) {
kernel_error_->Map(nullptr);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册