提交 e52c49b3 编写于 作者: L liuqi

Support conv 1x1 with stride == 2 and padding == SAME.

上级 29c3f0f7
......@@ -10,8 +10,13 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__read_only image2d_t bn_offset, /* cout%4 * cout/4 */
#endif
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
__private const int width) {
__private const int height,
__private const int width,
__private const int padding_top,
__private const int padding_left) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
......@@ -32,24 +37,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif
int4 w;
w.x = out_w_blk;
#if STRIDE == 1
w.x = out_w_blk - padding_left;
w.y = w.x + out_w_blks;
w.z = w.y + out_w_blks;
w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height) - padding_top;
#else
w.x = out_w_blk * 2 - padding_left;
w.y = (out_w_blk + out_w_blks) * 2 - padding_left;
w.z = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
w.w = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
int out_hb_idx = (out_hb % height) * 2 - padding_top;
#endif
w.x = select(w.x, INT_MIN, (w.x < 0 || w.x >= in_width));
w.y = select(w.y, INT_MIN, (w.y < 0 || w.y >= in_width));
w.z = select(w.z, INT_MIN, (w.z < 0 || w.z >= in_width));
w.w = select(w.w, INT_MIN, (w.w < 0 || w.w >= in_width));
out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
-1,
out_hb_idx >= in_height);
// Unrolling this loop hurts performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
DATA_TYPE4 in1 = 0;
DATA_TYPE4 in2 = 0;
DATA_TYPE4 in3 = 0;
if (w.y < width) {
// Conditional loads hurt perf; this branching helps sometimes
in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
}
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
const int filter_x0 = in_ch_blk << 2;
DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
......@@ -78,7 +96,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3 += in3.z * weights2;
out3 += in3.w * weights3;
in_x_base += width;
in_x_base += in_width;
}
#ifdef FUSED_BATCH_NORM
......@@ -111,14 +129,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#endif
const int out_x_base = out_ch_blk * width;
WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
int out_x_idx = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
if (w.y >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
if (w.z >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
if (w.w >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
......@@ -19,23 +19,24 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
float4 out0 =
convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
float4 out1 = out0;
float4 out2 = out0;
float4 out3 = out0;
float4 out4 = out0;
#else
float4 out0 = 0;
float4 out1 = 0;
float4 out2 = 0;
float4 out3 = 0;
float4 out4 = 0;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
out0 =
convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
out1 = out0;
out2 = out0;
out3 = out0;
out4 = out0;
#endif
#ifdef STRIDE_1
#if STRIDE == 1
int in_width0 = out_w_blk - padding_left;
int in_width1 = in_width0 + out_w_blks;
int in_width2 = in_width1 + out_w_blks;
......
......@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
const int *padding,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t height = output->dim(1);
......@@ -29,9 +30,7 @@ void Conv1x1(const Tensor *input,
const index_t width_blocks = RoundUpDiv4(width);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
MACE_CHECK(stride == 1);
MACE_CHECK(input_batch == batch);
MACE_CHECK(stride != 1 || (input_height == height && input_width == width));
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
......@@ -54,8 +53,13 @@ void Conv1x1(const Tensor *input,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
}
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
......@@ -74,7 +78,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const Tensor *bias,
const int *padding,
Tensor *output) {
Conv1x1(input, filter, bias, 1, output);
Conv1x1(input, filter, bias, 1, padding, output);
};
extern void Conv2dOpenclK1x1S2(const Tensor *input,
......@@ -82,7 +86,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const Tensor *bias,
const int *padding,
Tensor *output) {
Conv1x1(input, filter, bias, 2, output);
Conv1x1(input, filter, bias, 2, padding, output);
};
} // namespace kernels
......
......@@ -28,7 +28,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : "");
built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
built_options.emplace(bias != nullptr ? "-DBIAS" : "");
built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
built_options.emplace("-DSTRIDE=" + ToString(stride));
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
......
......@@ -420,15 +420,6 @@ template<DeviceType D>
void TestConv1x1() {
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<D, float>(
......@@ -445,8 +436,37 @@ void TestConv1x1() {
{1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run
net.RunOp(D);
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected = CreateTensor<float>(
......@@ -465,9 +485,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
// TestConv1x1<DeviceType::OPENCL>();
//}
TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
template<DeviceType D, typename T>
static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
......@@ -631,4 +651,3 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
//TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
// TestHalfComplexConvNxNS12<DeviceType::OPENCL, half>({32, 32, 64, 128});
//}
......@@ -351,13 +351,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
}
template <typename T>
std::string ToString(const T &input) {
std::stringstream ss;
ss << input;
return ss.str();
}
template <DeviceType D, typename T>
void BufferToImage(OpsTestNet &net,
const std::string &input_name,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册