From 116d33db77ecadad5730f0eda9318d81b5833a4f Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Thu, 7 Dec 2017 14:00:32 +0800
Subject: [PATCH] Add auto-tuning for opencl kernel.

---
 mace/core/net.cc                              |  5 ++
 mace/kernels/opencl/batch_norm_opencl.cc      | 48 ++++++++------
 mace/kernels/opencl/bias_add_opencl.cc        |  2 +-
 mace/kernels/opencl/cl/addn.cl                | 10 ++-
 mace/kernels/opencl/cl/buffer_to_image.cl     |  9 +--
 mace/kernels/opencl/cl/conv_2d.cl             | 13 ++--
 mace/kernels/opencl/cl/conv_2d_1x1.cl         | 20 +++---
 mace/kernels/opencl/cl/conv_2d_3x3.cl         | 13 ++--
 mace/kernels/opencl/cl/resize_bilinear.cl     |  9 ++-
 mace/kernels/opencl/concat.cc                 |  8 +--
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     | 57 ++++++++++++----
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     | 54 ++++++++++++---
 mace/kernels/opencl/conv_2d_opencl_general.cc | 55 +++++++++++++---
 mace/kernels/opencl/relu_opencl.cc            | 66 +++++++++++++------
 mace/ops/conv_2d_benchmark.cc                 |  2 +
 mace/tools/benchmark/benchmark_model.cc       |  6 +-
 mace/utils/tuner.h                            |  1 +
 tools/validate.py                             | 11 +---
 tools/validate_gcn.sh                         |  5 +-
 19 files changed, 261 insertions(+), 133 deletions(-)
diff --git a/mace/core/net.cc b/mace/core/net.cc
index 40312f3d..e1b16a03 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -87,6 +87,11 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
     VLOG(1) << "Op " << op->debug_def().name()
             << " has shape: " << internal::MakeString(op->Output(0)->shape());
   }
+#ifdef __USE_OPENCL
+  if (device_type_ == DeviceType::OPENCL) {
+    OpenCLRuntime::Get()->command_queue().finish();
+  }
+#endif
   return true;
 }
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 09b160d7..a5362262 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -11,7 +11,7 @@
 namespace mace {
 namespace kernels {
 
-template <typename T>
+template<typename T>
 void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     const Tensor *input,
     const Tensor *scale,
@@ -27,10 +27,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
   auto dt = DataTypeToEnum<T>::value;
@@ -38,9 +34,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  const std::vector<uint32_t> lws = {1, kwg_size, 1};
-
   uint32_t idx = 0;
   bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
@@ -50,18 +43,31 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
   bm_kernel.setArg(idx++, epsilon_);
   bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
     return {{8, 128, 1}, //SNPE size
-            {1, 1, 64},
-            {1, 1, 128},
-            {1, kwg_size/16, 16},
-            {1, kwg_size/32, 32},
-            {1, kwg_size/64, 64},
-            {1, kwg_size/128, 128},
-            {1, 1, kwg_size},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
             {1, kwg_size, 1}};
   };
-  auto func = [&](const std::vector<uint32_t>& params)->cl_int {
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         bm_kernel, cl::NullRange,
         cl::NDRange(gws[0], gws[1], gws[2]),
@@ -73,10 +79,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
   };
   std::stringstream ss;
   ss << "batch_norm_opencl_kernel_"
-      << input->dim(0) << "_"
-      << input->dim(1) << "_"
-      << input->dim(2) << "_"
-      << input->dim(3);
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
   Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
                                                      lws,
                                                      params_generator,
diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc
index b778c6b7..b03bbc5e 100644
--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -36,7 +36,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
   auto bias_kernel = runtime->BuildKernel("bias_add", "bias_add", built_options);
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bias_kernel);
-  const std::vector<uint32_t> lws = {1, kwg_size, 1};
+  const std::vector<uint32_t> lws = {8, 16, 8};
 
   uint32_t idx = 0;
   bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl
index a9309930..9504d12a 100644
--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -12,19 +12,17 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
   const int w = get_global_id(0);
   const int hb = get_global_id(1);
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
-  DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
-  DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
+  DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
+  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
   DATA_TYPE4 out = in0 + in1;
 
 #if INPUT_NUM > 2
-  DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
+  DATA_TYPE4 in2 = READ_IMAGET(input2, SAMPLER, (int2)(w, hb));
   out = out + in2;
 #endif
 
 #if INPUT_NUM > 3
-  DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
+  DATA_TYPE4 in3 = READ_IMAGET(input3, SAMPLER, (int2)(w, hb));
   out = out + in3;
 #endif
 
diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl
index b7812d6e..78e009a6 100644
--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -54,9 +54,8 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
                            + out_channel_idx;
 
   if (in_channel_idx < in_channel) {
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
     int2 coord = (int2)(w, h);
-    VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
+    VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
     const int size = (out_channel - out_channel_idx);
     if (size < 4) {
       switch (size) {
@@ -119,9 +118,8 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
                            + channel_idx;
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
   int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
+  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
   const int size = channels - channel_idx;
   if (size < 4) {
     switch (size) {
@@ -169,9 +167,8 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
   int h = get_global_id(1);
   const int offset = w * 4;
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
   int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
+  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
   const int size = count - offset;
   if (size < 4) {
     switch (size) {
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index e5ddb3d7..060cb39a 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -21,10 +21,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int out_hb = get_global_id(2);
   const int rounded_in_ch = in_ch_blks * 4;
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #ifdef BIAS
   DATA_TYPE4 out0 =
-     READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
   DATA_TYPE4 out1 = out0;
   DATA_TYPE4 out2 = out0;
   DATA_TYPE4 out3 = out0;
@@ -71,7 +70,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
         in_width_value = select(in_idx + in_width_value,                             \
                                 -1,                                                  \
                                 (in_width_value < 0 || in_width_value >= in_width)); \
-        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+        in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value));
 
         READ_INPUT(0);
         READ_INPUT(1);
@@ -81,10 +80,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #undef READ_INPUT
 
         int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
-        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
 
         // Will prefetch L2 improve performance? How to pretch image data?
 
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index bf384467..1249f1fc 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -16,10 +16,8 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
 #ifdef BIAS
-  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
   DATA_TYPE4 out1 = out0;
   DATA_TYPE4 out2 = out0;
   DATA_TYPE4 out3 = out0;
@@ -58,16 +56,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
 
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
-    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
-    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
-    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
+    DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
+    DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
+    DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
+    DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));
 
     const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
-    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
-    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
-    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
     // Will prefetch L2 improve performance? How to pretch image data?
 
     out0 += in0.x * weights0;
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 08bf04d3..9d032a72 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -19,10 +19,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_hb = get_global_id(2);
   const int rounded_in_ch = in_ch_blks * 4;
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #ifdef BIAS
   DATA_TYPE4 out0 =
-     READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
   DATA_TYPE4 out1 = out0;
   DATA_TYPE4 out2 = out0;
   DATA_TYPE4 out3 = out0;
@@ -72,7 +71,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
         in_width_value = select(in_idx + in_width_value,                             \
                                 -1,                                                  \
                                 (in_width_value < 0 || in_width_value >= in_width)); \
-        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+        in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value));
 
         READ_INPUT(0);
         READ_INPUT(1);
@@ -83,10 +82,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #undef READ_INPUT
 
         int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
-        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
 
         // Will prefetch L2 improve performance? How to pretch image data?
 
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index efb769d2..72f09c96 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -25,17 +25,16 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
   const float h_lerp = h_in - h_lower;
   const float w_lerp = w_in - w_lower;
 
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
   const int in_w_offset = ch_blk * in_width;
   const int in_h_offset = b * in_height;
 
-  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
+  DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
           (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
-  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
+  DATA_TYPE4 top_right = READ_IMAGET(input, SAMPLER,
           (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
-  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
+  DATA_TYPE4 bottom_left = READ_IMAGET(input, SAMPLER,
           (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
-  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
+  DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
           (int2)(in_w_offset + w_upper, in_h_offset + h_upper));
 
   DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 80a23a44..f80f370d 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -43,10 +43,10 @@ static void Concat2(const Tensor *input0,
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
 
-  uint32_t lws[3];
-  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
+  uint32_t lws[3] = {8, 16, 8};
+//  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+//  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
+//  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
 
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       concat_kernel, cl::NullRange,
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index d759689c..1fe00494 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -7,6 +7,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -48,7 +49,6 @@ void Conv1x1(const Tensor *input,
   auto program = runtime->program();
 
   auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
 
   uint32_t idx = 0;
   conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
@@ -63,16 +63,51 @@ void Conv1x1(const Tensor *input,
   conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
 
-  auto command_queue = runtime->command_queue();
-  cl_int error;
-  error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks),
-                  static_cast<uint32_t>(width_blocks),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(4, 15, 8), // TODO auto tuning
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 15, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size/16, 4, 4},
+            {kwg_size/32, 4, 8},
+            {kwg_size/32, 8, 4},
+            {kwg_size/64, 8, 8},
+            {kwg_size/64, 16, 4},
+            {kwg_size/128, 8, 16},
+            {kwg_size/128, 16, 8},
+            {kwg_size/128, 32, 4},
+            {1, kwg_size/32, 32},
+            {1, kwg_size/64, 64},
+            {1, kwg_size/128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t>& params)->cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        conv_2d_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "conv2d_1x1_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
+
 }
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input,
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 7b7453ad..858fc5fc 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -7,6 +7,7 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -54,15 +55,50 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
   conv_2d_kernel.setArg(idx++, padding[1] / 2);
 
-  auto command_queue = runtime->command_queue();
-  cl_int error;
-  error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(16, 16, 4),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {4, 15, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        conv_2d_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "conv2d_3x3_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 
 }
 void Conv2dOpenclK3x3S1(const Tensor *input,
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index e46ecbca..7a74f86b 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -7,6 +7,7 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -38,7 +39,6 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
   auto program = runtime->program();
 
   auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
 
   uint32_t idx = 0;
   conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
@@ -57,15 +57,50 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
   conv_2d_kernel.setArg(idx++, padding[1] / 2);
 
-  auto command_queue = runtime->command_queue();
-  cl_int error;
-  error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(16, 16, 4),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        conv_2d_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "conv2d_general_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 
 }
 
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index 46988793..28ff881b 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -7,6 +7,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -22,10 +23,6 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
 
   const index_t channel_blocks = RoundUpDiv4(channels);
 
-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
 
@@ -33,38 +30,65 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   auto dt = DataTypeToEnum<T>::value;
   built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  cl::Kernel relu_kernel;
   if (max_limit_ < 0) {
-    auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
-    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-    const uint32_t lws[3] = {1, kwg_size, 1};
+    relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
 
     uint32_t idx = 0;
     relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
     relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        relu_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]),
-        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-    MACE_CHECK(error == CL_SUCCESS);
   } else {
-    auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
-    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-    const uint32_t lws[3] = {1, kwg_size, 1};
+    relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
 
     uint32_t idx = 0;
     relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
     relu_kernel.setArg(idx++, max_limit_);
     relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-
+  }
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
         cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
         NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-    MACE_CHECK(error == CL_SUCCESS);
-  }
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "relu_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }
 
 template
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index b7f6fc73..0870f2b8 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -102,6 +102,8 @@ BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
 BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
+BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
+BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);
 
 // Test RGB <-> YUV
 //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc
index 09ac6fd6..06816386 100644
--- a/mace/tools/benchmark/benchmark_model.cc
+++ b/mace/tools/benchmark/benchmark_model.cc
@@ -260,8 +260,6 @@ int Main(int argc, char **argv) {
   DeviceType_Parse(device, &device_type);
   VLOG(0) << device_type;
 
-  if (device_type == DeviceType::OPENCL)
-    OpenCLRuntime::EnableProfiling();
 
   // load model
   std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
@@ -296,9 +294,11 @@ int Main(int argc, char **argv) {
       }
     }
   }
+  auto net = CreateNet(net_def, &ws, device_type, NetMode::INIT);
+  net->Run();
 
   // create net
-  auto net = CreateNet(net_def, &ws, device_type);
+  net = CreateNet(net_def, &ws, device_type);
 
   int64_t warmup_time_us = 0;
   int64_t num_warmup_runs = 0;
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index 722c9c86..32c21e9b 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -42,6 +42,7 @@ class Tuner {
     } else {
       // run
       if (param_table_.find(param_key) != param_table_.end()) {
+        VLOG(1) << param_key << ": " << internal::MakeString(param_table_[param_key]);
         return func(param_table_[param_key]);
       } else {
         return func(default_param);
diff --git a/tools/validate.py b/tools/validate.py
index 42a856b0..9edbdd24 100644
--- a/tools/validate.py
+++ b/tools/validate.py
@@ -18,10 +18,9 @@ from tensorflow import gfile
 #        --input_file input_file \
 #        --mace_out_file icnet.out
 
-
 def generate_data(shape):
   np.random.seed(FLAGS.random_seed)
-  data = np.random.random(shape)
+  data = np.random.random(shape) * -1
   print FLAGS.input_file
   data.astype(np.float32).tofile(FLAGS.input_file)
   print "Generate input file done."
@@ -36,12 +35,8 @@ def valid_output(out_shape, mace_out_file, tf_out_value):
   mace_out_value = load_data(mace_out_file)
   if mace_out_value.size != 0:
     mace_out_value = mace_out_value.reshape(out_shape)
-    np.testing.assert_allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3)
-    res = np.allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3)
-    if res:
-      print '=======================Passed! Haha======================'
-    else:
-      print '=======================Failed! Oops======================'
+    np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
+    print '=======================Passed! Haha======================'
   else:
     print '=======================Skip empty node==================='
 
diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh
index f4dfc6eb..524c752b 100644
--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -32,7 +32,7 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
                                          --output=${MODEL_DIR}/${MACE_MODEL_NAME} \
                                          --input_node=input \
                                          --output_node=GCN/br_result_2/fcn_br \
-                                         --data_type=DT_FLOAT \
+                                         --data_type=DT_HALF\
                                          --runtime=gpu
 
 
@@ -50,7 +50,7 @@ adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
 
-num_threads=${1:-1}
+num_threads=${1:-4}
 
 adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
         MACE_KERNEL_PATH=$KERNEL_DIR \
@@ -77,4 +77,3 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
     --input_node input \
     --output_node GCN/br_result_2/fcn_br\
     --output_shape 1,512,512,2
-
-- 
GitLab