Optimize opencl conv 3x3 kernel.

1374a365 · liuqi · fd284f6a · 1374a365 · 1374a365 · 1374a365
5 changed file
--- a/mace/core/opencl_allocator.cc
+++ b/mace/core/opencl_allocator.cc
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
  switch (t) {
    case DT_HALF:
+      return CL_HALF_FLOAT;
    case DT_FLOAT:
      return CL_FLOAT;
    case DT_INT8:

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                          __write_only image2d_t output,
                          __private const int in_height,
                          __private const int in_width,
-                          __private const int in_channels,
+                          __private const int in_ch_blks,
                          __private const int out_height,
                          __private const int out_width,
                          __private const int padding_top,
@@ -17,25 +17,26 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
-  const int in_ch_blks = (in_channels + 3) / 4;
  const int rounded_in_ch = in_ch_blks * 4;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-  VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0};
+  VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
 #ifdef BIAS
  out[0] =
      CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
  out[1] = out[0];
  out[2] = out[0];
  out[3] = out[0];
+  out[4] = out[0];
 #endif

-  int w[4];
+  int w[5];
  w[0] = out_w_blk - padding_left;
  w[1] = w[0] + out_w_blks;
  w[2] = w[1] + out_w_blks;
  w[3] = w[2] + out_w_blks;
+  w[4] = w[3] + out_w_blks;

  const int batch_idx = out_hb / out_height;
  const int height_idx = out_hb % out_height;
@@ -50,64 +51,83 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]

  const int input_image_width = in_ch_blks * in_width;

+  VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
+  VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
+  int in_idx, hb_idx, width_idx, in_width_idx;
  // Unrolling this loop hurt perfmance
-  int idx = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
-    VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
-
-    int filter_idx = in_ch_blk << 2;
-    int in_idx = in_ch_blk * in_width;
-
-    #pragma unroll
-    for (int i = 0; i < 3; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        idx = i * 12 + j * 4;
-        int in_width_idx = w[0] + j;
-        // Judge the width border for padding input.
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 0] = 0;
-        } else {
-          in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[1] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 1] = 0;
-        } else {
-          in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[2] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 2] = 0;
-        } else {
-          in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[3] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 3] = 0;
-        } else {
-          in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-
-        weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
-
-        filter_idx += rounded_in_ch;
+    for (int i = 0; i < 9; ++i) {
+
+      in_idx = in_ch_blk * in_width;
+
+      hb_idx = i / 3;
+      width_idx = i % 3;
+      in_width_idx = w[0] + width_idx;
+      // Judge the width border for padding input.
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[0] = 0;
+      } else {
+        in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
      }
-    }
-    // Will prefetch L2 improve performance? How to pretch image data?
-
-    // Interleaving load and mul does not improve performance as expected
-    #pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      for (int i = 0; i < 9; ++i) {
-        out[c] += in[c + i * 4].x * weights[0 + i * 4];
-        out[c] += in[c + i * 4].y * weights[1 + i * 4];
-        out[c] += in[c + i * 4].z * weights[2 + i * 4];
-        out[c] += in[c + i * 4].w * weights[3 + i * 4];
+      in_width_idx = w[1] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[1] = 0;
+      } else {
+        in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[2] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[2] = 0;
+      } else {
+        in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[3] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[3] = 0;
+      } else {
+        in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[4] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[4] = 0;
+      } else {
+        in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
      }
+
+
+      int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
+      weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+      weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+      weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+      weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+      // Will prefetch L2 improve performance? How to pretch image data?
+
+      // Interleaving load and mul does not improve performance as expected
+      out[0] += in[0].x * weights[0];
+      out[0] += in[0].y * weights[1];
+      out[0] += in[0].z * weights[2];
+      out[0] += in[0].w * weights[3];
+
+      out[1] += in[1].x * weights[0];
+      out[1] += in[1].y * weights[1];
+      out[1] += in[1].z * weights[2];
+      out[1] += in[1].w * weights[3];
+
+      out[2] += in[2].x * weights[0];
+      out[2] += in[2].y * weights[1];
+      out[2] += in[2].z * weights[2];
+      out[2] += in[2].w * weights[3];
+
+      out[3] += in[3].x * weights[0];
+      out[3] += in[3].y * weights[1];
+      out[3] += in[3].z * weights[2];
+      out[3] += in[3].w * weights[3];
+
+      out[4] += in[4].x * weights[0];
+      out[4] += in[4].y * weights[1];
+      out[4] += in[4].z * weights[2];
+      out[4] += in[4].w * weights[3];
    }
  }

@@ -133,4 +153,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
                                       (int2)(out_x_base + w[3], out_hb),
                                       out[3]);
+
+  w[4] += padding_left;
+  if (w[4] >= out_width) return;
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
+                                       (int2)(out_x_base + w[4], out_hb),
+                                       out[4]);
 }
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,7 +22,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,

  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
-  const index_t width_blocks = RoundUpDiv4(width);
+  const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

  std::set<std::string> built_options;
  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
@@ -44,7 +44,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
  conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
  conv_2d_kernel.setArg(idx++, padding[0] / 2);
@@ -56,7 +56,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
      conv_2d_kernel, cl::NullRange,
      cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(4, 15, 8),
+      cl::NDRange(16, 16, 4),
      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS, error);


--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -27,10 +27,10 @@ static void Conv2d(int iters,
  OpsTestNet net;

  // Add input data
-  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
-  net.AddRandomInput<D, float>("Filter",
+  net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
+  net.AddRandomInput<D, T>("Filter",
                               {kernel_h, kernel_w, channels, output_channels});
-  net.AddRandomInput<D, float>("Bias", {output_channels});
+  net.AddRandomInput<D, T>("Bias", {output_channels});

  if (D == DeviceType::OPENCL) {
    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -88,6 +88,7 @@ static void Conv2d(int iters,
      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

 // ICNet
@@ -99,7 +100,7 @@ BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);

 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
-BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);

 // Test RGB <-> YUV
 BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);

--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -84,18 +84,18 @@ TEST_F(Conv2dOpTest, NEONSimple) {
  TestSimple3x3SAME<DeviceType::NEON>();
 }

-template<DeviceType D>
+template<DeviceType D, typename T>
 void TestNHWCSimple3x3VALID() {
  OpsTestNet net;
  // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Input", {1, 3, 3, 2},
      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});

  if (D == DeviceType::OPENCL) {
    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -130,23 +130,23 @@ void TestNHWCSimple3x3VALID() {
    net.RunOp(D);
  }

-  auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  auto expected = CreateTensor<T>({1, 1, 1, 1}, {18.1f});
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
 }

-template<DeviceType D>
+template<DeviceType D, typename T>
 void TestNHWCSimple3x3SAME() {
  OpsTestNet net;

  // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Input", {1, 3, 3, 2},
      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
      "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});

  if (D == DeviceType::OPENCL) {
    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -181,21 +181,21 @@ void TestNHWCSimple3x3SAME() {
    net.RunOp(D);
  }

-  auto expected = CreateTensor<float>(
+  auto expected = CreateTensor<T>(
      {1, 3, 3, 1},
      {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});

-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
 }

 TEST_F(Conv2dOpTest, CPUSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::CPU>();
-  TestNHWCSimple3x3SAME<DeviceType::CPU>();
+  TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
+  TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
 }

 TEST_F(Conv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID<DeviceType::OPENCL>();
-  TestNHWCSimple3x3SAME<DeviceType::OPENCL>();
+  TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
+  TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
 }

 template<DeviceType D>
@@ -457,11 +457,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
    srand(time(NULL));

    // generate random input
-    index_t batch = 3 + rand() % 10;
+    index_t batch = 1;
    index_t height = shape[0];
    index_t width = shape[1];
-    index_t input_channels = shape[2] + rand() % 10;
-    index_t output_channels = shape[3] + rand() % 10;
+    index_t input_channels = shape[2];
+    index_t output_channels = shape[3];
    // Construct graph
    OpsTestNet net;
    OpDefBuilder("Conv2D", "Conv2dTest")
@@ -509,6 +509,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {

  for (int kernel_size : {3}) {
    for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
      func(kernel_size, kernel_size, stride, stride, SAME);
    }
  }