Add auto-tuning for opencl kernel.

116d33db · liuqi · a66e0e35 · 116d33db · 116d33db · 116d33db
19 changed file
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -87,6 +87,11 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
    VLOG(1) << "Op " << op->debug_def().name()
            << " has shape: " << internal::MakeString(op->Output(0)->shape());
  }
+#ifdef __USE_OPENCL
+  if (device_type_ == DeviceType::OPENCL) {
+    OpenCLRuntime::Get()->command_queue().finish();
+  }
+#endif
  return true;
 }


--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -11,7 +11,7 @@
 namespace mace {
 namespace kernels {

-template <typename T>
+template<typename T>
 void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
    const Tensor *input,
    const Tensor *scale,
@@ -27,10 +27,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(

  const index_t channel_blocks = RoundUpDiv4(channels);

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
  auto runtime = OpenCLRuntime::Get();
  std::set<std::string> built_options;
  auto dt = DataTypeToEnum<T>::value;
@@ -38,9 +34,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);

-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  const std::vector<uint32_t> lws = {1, kwg_size, 1};
-
  uint32_t idx = 0;
  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
@@ -50,18 +43,31 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
  bm_kernel.setArg(idx++, epsilon_);
  bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

-  auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
    return {{8, 128, 1}, //SNPE size
-            {1, 1, 64},
-            {1, 1, 128},
-            {1, kwg_size/16, 16},
-            {1, kwg_size/32, 32},
-            {1, kwg_size/64, 64},
-            {1, kwg_size/128, 128},
-            {1, 1, kwg_size},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
            {1, kwg_size, 1}};
  };
-  auto func = [&](const std::vector<uint32_t>& params)->cl_int {
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
        bm_kernel, cl::NullRange,
        cl::NDRange(gws[0], gws[1], gws[2]),
@@ -73,10 +79,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
  };
  std::stringstream ss;
  ss << "batch_norm_opencl_kernel_"
-      << input->dim(0) << "_"
-      << input->dim(1) << "_"
-      << input->dim(2) << "_"
-      << input->dim(3);
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
                                                     lws,
                                                     params_generator,

--- a/mace/kernels/opencl/bias_add_opencl.cc
+++ b/mace/kernels/opencl/bias_add_opencl.cc
@@ -36,7 +36,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
  auto bias_kernel = runtime->BuildKernel("bias_add", "bias_add", built_options);

  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bias_kernel);
-  const std::vector<uint32_t> lws = {1, kwg_size, 1};
+  const std::vector<uint32_t> lws = {8, 16, 8};

  uint32_t idx = 0;
  bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));

--- a/mace/kernels/opencl/cl/addn.cl
+++ b/mace/kernels/opencl/cl/addn.cl
@@ -12,19 +12,17 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
  const int w = get_global_id(0);
  const int hb = get_global_id(1);

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
-  DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
-  DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
+  DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
+  DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
  DATA_TYPE4 out = in0 + in1;

 #if INPUT_NUM > 2
-  DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
+  DATA_TYPE4 in2 = READ_IMAGET(input2, SAMPLER, (int2)(w, hb));
  out = out + in2;
 #endif

 #if INPUT_NUM > 3
-  DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
+  DATA_TYPE4 in3 = READ_IMAGET(input3, SAMPLER, (int2)(w, hb));
  out = out + in3;
 #endif


--- a/mace/kernels/opencl/cl/buffer_to_image.cl
+++ b/mace/kernels/opencl/cl/buffer_to_image.cl
@@ -54,9 +54,8 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
                           + out_channel_idx;

  if (in_channel_idx < in_channel) {
-    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
    int2 coord = (int2)(w, h);
-    VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
+    VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
    const int size = (out_channel - out_channel_idx);
    if (size < 4) {
      switch (size) {
@@ -119,9 +118,8 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
  const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
                           + channel_idx;

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
+  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
  const int size = channels - channel_idx;
  if (size < 4) {
    switch (size) {
@@ -169,9 +167,8 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
  int h = get_global_id(1);
  const int offset = w * 4;

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  int2 coord = (int2)(w, h);
-  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
+  VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
  const int size = count - offset;
  if (size < 4) {
    switch (size) {

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -21,10 +21,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #ifdef BIAS
  DATA_TYPE4 out0 =
-     READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
@@ -71,7 +70,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
        in_width_value = select(in_idx + in_width_value,                             \
                                -1,                                                  \
                                (in_width_value < 0 || in_width_value >= in_width)); \
-        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+        in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value));

        READ_INPUT(0);
        READ_INPUT(1);
@@ -81,10 +80,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #undef READ_INPUT

        int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
-        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));

        // Will prefetch L2 improve performance? How to pretch image data?


--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -16,10 +16,8 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-
 #ifdef BIAS
-  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
@@ -58,16 +56,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  int in_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {

-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
-    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
-    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
-    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
+    DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
+    DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
+    DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
+    DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));

    const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
-    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
-    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
-    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
    // Will prefetch L2 improve performance? How to pretch image data?

    out0 += in0.x * weights0;

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -19,10 +19,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #ifdef BIAS
  DATA_TYPE4 out0 =
-     READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+     READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
@@ -72,7 +71,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
        in_width_value = select(in_idx + in_width_value,                             \
                                -1,                                                  \
                                (in_width_value < 0 || in_width_value >= in_width)); \
-        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+        in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value));

        READ_INPUT(0);
        READ_INPUT(1);
@@ -83,10 +82,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #undef READ_INPUT

        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
-        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));

        // Will prefetch L2 improve performance? How to pretch image data?


--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -25,17 +25,16 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  const int in_w_offset = ch_blk * in_width;
  const int in_h_offset = b * in_height;

-  DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
+  DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
          (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
-  DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
+  DATA_TYPE4 top_right = READ_IMAGET(input, SAMPLER,
          (int2)(in_w_offset + w_upper, in_h_offset + h_lower));
-  DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
+  DATA_TYPE4 bottom_left = READ_IMAGET(input, SAMPLER,
          (int2)(in_w_offset + w_lower, in_h_offset + h_upper));
-  DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
+  DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
          (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;

--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -43,10 +43,10 @@ static void Concat2(const Tensor *input0,

  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);

-  uint32_t lws[3];
-  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
+  uint32_t lws[3] = {8, 16, 8};
+//  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+//  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
+//  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));

  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      concat_kernel, cl::NullRange,

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -7,6 +7,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -48,7 +49,6 @@ void Conv1x1(const Tensor *input,
  auto program = runtime->program();

  auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);

  uint32_t idx = 0;
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
@@ -63,16 +63,51 @@ void Conv1x1(const Tensor *input,
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));

-  auto command_queue = runtime->command_queue();
-  cl_int error;
-  error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks),
-                  static_cast<uint32_t>(width_blocks),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(4, 15, 8), // TODO auto tuning
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 15, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size/16, 4, 4},
+            {kwg_size/32, 4, 8},
+            {kwg_size/32, 8, 4},
+            {kwg_size/64, 8, 8},
+            {kwg_size/64, 16, 4},
+            {kwg_size/128, 8, 16},
+            {kwg_size/128, 16, 8},
+            {kwg_size/128, 32, 4},
+            {1, kwg_size/32, 32},
+            {1, kwg_size/64, 64},
+            {1, kwg_size/128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t>& params)->cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        conv_2d_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "conv2d_1x1_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
+
 }

 extern void Conv2dOpenclK1x1S1(const Tensor *input,

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -7,6 +7,7 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -54,15 +55,50 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
  conv_2d_kernel.setArg(idx++, padding[1] / 2);

-  auto command_queue = runtime->command_queue();
-  cl_int error;
-  error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(16, 16, 4),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {4, 15, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        conv_2d_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "conv2d_3x3_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);

 }
 void Conv2dOpenclK3x3S1(const Tensor *input,

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -7,6 +7,7 @@
 #include "mace/kernels/conv_2d.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -38,7 +39,6 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
  auto program = runtime->program();

  auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);

  uint32_t idx = 0;
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
@@ -57,15 +57,50 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
  conv_2d_kernel.setArg(idx++, padding[1] / 2);

-  auto command_queue = runtime->command_queue();
-  cl_int error;
-  error = command_queue.enqueueNDRangeKernel(
-      conv_2d_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(16, 16, 4),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width_blocks),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        conv_2d_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "conv2d_general_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);

 }


--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -7,6 +7,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -22,10 +23,6 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,

  const index_t channel_blocks = RoundUpDiv4(channels);

-  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
-                           static_cast<uint32_t>(width),
-                           static_cast<uint32_t>(height * batch)};
-
  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();

@@ -33,38 +30,65 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
  auto dt = DataTypeToEnum<T>::value;
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  cl::Kernel relu_kernel;
  if (max_limit_ < 0) {
-    auto relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);
-    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-    const uint32_t lws[3] = {1, kwg_size, 1};
+    relu_kernel  = runtime->BuildKernel("relu", "relu", built_options);

    uint32_t idx = 0;
    relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
    relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        relu_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]),
-        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-    MACE_CHECK(error == CL_SUCCESS);
  } else {
-    auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
-    const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-    const uint32_t lws[3] = {1, kwg_size, 1};
+    relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);

    uint32_t idx = 0;
    relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
    relu_kernel.setArg(idx++, max_limit_);
    relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-
+  }
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(width),
+                           static_cast<uint32_t>(height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{4, 15, 8}, //SNPE size
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
        relu_kernel, cl::NullRange,
        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(lws[0], lws[1], lws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-    MACE_CHECK(error == CL_SUCCESS);
-  }
+
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "relu_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template

--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -102,6 +102,8 @@ BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
 BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
+BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
+BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);

 // Test RGB <-> YUV
 //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);

--- a/mace/tools/benchmark/benchmark_model.cc
+++ b/mace/tools/benchmark/benchmark_model.cc
@@ -260,8 +260,6 @@ int Main(int argc, char **argv) {
  DeviceType_Parse(device, &device_type);
  VLOG(0) << device_type;

-  if (device_type == DeviceType::OPENCL)
-    OpenCLRuntime::EnableProfiling();

  // load model
  std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
@@ -296,9 +294,11 @@ int Main(int argc, char **argv) {
      }
    }
  }
+  auto net = CreateNet(net_def, &ws, device_type, NetMode::INIT);
+  net->Run();

  // create net
-  auto net = CreateNet(net_def, &ws, device_type);
+  net = CreateNet(net_def, &ws, device_type);

  int64_t warmup_time_us = 0;
  int64_t num_warmup_runs = 0;

--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -42,6 +42,7 @@ class Tuner {
    } else {
      // run
      if (param_table_.find(param_key) != param_table_.end()) {
+        VLOG(1) << param_key << ": " << internal::MakeString(param_table_[param_key]);
        return func(param_table_[param_key]);
      } else {
        return func(default_param);

--- a/tools/validate.py
+++ b/tools/validate.py
@@ -18,10 +18,9 @@ from tensorflow import gfile
 #        --input_file input_file \
 #        --mace_out_file icnet.out

-
 def generate_data(shape):
  np.random.seed(FLAGS.random_seed)
-  data = np.random.random(shape)
+  data = np.random.random(shape) * -1
  print FLAGS.input_file
  data.astype(np.float32).tofile(FLAGS.input_file)
  print "Generate input file done."
@@ -36,12 +35,8 @@ def valid_output(out_shape, mace_out_file, tf_out_value):
  mace_out_value = load_data(mace_out_file)
  if mace_out_value.size != 0:
    mace_out_value = mace_out_value.reshape(out_shape)
-    np.testing.assert_allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3)
-    res = np.allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3)
-    if res:
-      print '=======================Passed! Haha======================'
-    else:
-      print '=======================Failed! Oops======================'
+    np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
+    print '=======================Passed! Haha======================'
  else:
    print '=======================Skip empty node==================='


--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -32,7 +32,7 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
                                         --output=${MODEL_DIR}/${MACE_MODEL_NAME} \
                                         --input_node=input \
                                         --output_node=GCN/br_result_2/fcn_br \
-                                         --data_type=DT_FLOAT \
+                                         --data_type=DT_HALF\
                                         --runtime=gpu


@@ -50,7 +50,7 @@ adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}

-num_threads=${1:-1}
+num_threads=${1:-4}

 adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
        MACE_KERNEL_PATH=$KERNEL_DIR \
@@ -77,4 +77,3 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
    --input_node input \
    --output_node GCN/br_result_2/fcn_br\
    --output_shape 1,512,512,2
-