From e52c49b3bc46271f79c50e3883800c30ba2eb650 Mon Sep 17 00:00:00 2001
From: liuqi
Date: Thu, 30 Nov 2017 17:19:46 +0800
Subject: [PATCH] Support conv 1x1 with stride == 2 and padding == SAME.

---
 mace/kernels/opencl/cl/conv_2d_1x1.cl     | 62 +++++++++++++++--------
 mace/kernels/opencl/cl/conv_2d_3x3.cl     | 21 ++++----
 mace/kernels/opencl/conv_2d_opencl_1x1.cc | 12 +++--
 mace/kernels/opencl/conv_2d_opencl_3x3.cc |  2 +-
 mace/ops/conv_2d_test.cc                  | 49 ++++++++++++------
 mace/ops/ops_test_util.h                  |  7 ---
 6 files changed, 96 insertions(+), 57 deletions(-)

diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 689a7df2..9c7a37fd 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -10,8 +10,13 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __read_only image2d_t bn_offset, /* cout%4 * cout/4 */
 #endif
                           __write_only image2d_t output,
+                          __private const int in_height,
+                          __private const int in_width,
                           __private const int in_ch_blks,
-                          __private const int width) {
+                          __private const int height,
+                          __private const int width,
+                          __private const int padding_top,
+                          __private const int padding_left) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
@@ -32,24 +37,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif
 
   int4 w;
-  w.x = out_w_blk;
+#if STRIDE == 1
+  w.x = out_w_blk - padding_left;
   w.y = w.x + out_w_blks;
   w.z = w.y + out_w_blks;
   w.w = w.z + out_w_blks;
+  int out_hb_idx = (out_hb % height) - padding_top;
+#else
+  w.x = out_w_blk * 2 - padding_left;
+  w.y = (out_w_blk + out_w_blks) * 2 - padding_left;
+  w.z = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
+  w.w = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
+  int out_hb_idx = (out_hb % height) * 2 - padding_top;
+#endif
+
+  w.x = select(w.x, INT_MIN, (w.x < 0 || w.x >= in_width));
+  w.y = select(w.y, INT_MIN, (w.y < 0 || w.y >= in_width));
+  w.z = select(w.z, INT_MIN, (w.z < 0 || w.z >= in_width));
+  w.w = select(w.w, INT_MIN, (w.w < 0 || w.w >= in_width));
+
+  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
+                      -1,
+                      out_hb_idx >= in_height);
 
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
-    DATA_TYPE4 in1 = 0;
-    DATA_TYPE4 in2 = 0;
-    DATA_TYPE4 in3 = 0;
-    if (w.y < width) {
-      // conditional load hurt perf, this branching helps sometimes
-      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
-      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
-      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
-    }
+
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
+    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
+    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
+    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
 
     const int filter_x0 = in_ch_blk << 2;
     DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
@@ -78,7 +96,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
     out3 += in3.z * weights2;
     out3 += in3.w * weights3;
 
-    in_x_base += width;
+    in_x_base += in_width;
   }
 
 #ifdef FUSED_BATCH_NORM
@@ -111,14 +129,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif
 
   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
+  int out_x_idx = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
 
-  if (w.y >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
 
-  if (w.z >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
 
-  if (w.w >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
 }
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 12849ee3..e2f2827c 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -19,23 +19,24 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_hb = get_global_id(2);
   const int rounded_in_ch = in_ch_blks * 4;
 
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+#ifdef BIAS
+  float4 out0 =
+      convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
+  float4 out1 = out0;
+  float4 out2 = out0;
+  float4 out3 = out0;
+  float4 out4 = out0;
+#else
   float4 out0 = 0;
   float4 out1 = 0;
   float4 out2 = 0;
   float4 out3 = 0;
   float4 out4 = 0;
-
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-#ifdef BIAS
-  out0 =
-      convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
-  out1 = out0;
-  out2 = out0;
-  out3 = out0;
-  out4 = out0;
 #endif
 
-#ifdef STRIDE_1
+#if STRIDE == 1
   int in_width0 = out_w_blk - padding_left;
   int in_width1 = in_width0 + out_w_blks;
   int in_width2 = in_width1 + out_w_blks;
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index e8a88db5..477dbc98 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
              const Tensor *filter,
              const Tensor *bias,
              const int stride,
+             const int *padding,
              Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -29,9 +30,7 @@ void Conv1x1(const Tensor *input,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
-  MACE_CHECK(stride == 1);
   MACE_CHECK(input_batch == batch);
-  MACE_CHECK(stride != 1 || (input_height == height && input_width == width));
 
   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
@@ -54,8 +53,13 @@ void Conv1x1(const Tensor *input,
     conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
   }
   conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
+  conv_2d_kernel.setArg(idx++, padding[0] / 2);
+  conv_2d_kernel.setArg(idx++, padding[1] / 2);
 
   auto command_queue = runtime->command_queue();
   cl_int error;
@@ -74,7 +78,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                                const Tensor *bias,
                                const int *padding,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 1, output);
+  Conv1x1(input, filter, bias, 1, padding, output);
 };
 
 extern void Conv2dOpenclK1x1S2(const Tensor *input,
@@ -82,7 +86,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
                                const Tensor *bias,
                                const int *padding,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 2, output);
+  Conv1x1(input, filter, bias, 2, padding, output);
 };
 
 } // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 9a39b499..fc824708 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -28,7 +28,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : "");
   built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index b5f2df23..220056b6 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -420,15 +420,6 @@ template <DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>(
@@ -445,8 +436,37 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
   // Check
   auto expected = CreateTensor<float>(
@@ -465,9 +485,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }
 
-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}
 
 template <DeviceType D>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
@@ -631,4 +651,3 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
 //TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
 //  TestHalfComplexConvNxNS12({32, 32, 64, 128});
 //}
-
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index fefcfe5f..8d593940 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -351,13 +351,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   Expector<T>::Near(x, y, abs_err);
 }
 
-template <typename T>
-std::string ToString(const T &input) {
-  std::stringstream ss;
-  ss << input;
-  return ss.str();
-}
-
 template <DeviceType D, typename T>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
--
GitLab
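
For reference, the index arithmetic the updated conv_2d_1x1.cl kernel performs can be sketched as a small standalone C function. This is an illustrative sketch only, not part of the patch: the helper name is made up, the parameter names mirror the kernel arguments (stride, padding_left, in_width), and it assumes the kernel's sampler uses CLK_ADDRESS_CLAMP so the INT_MIN sentinel pushes the read far out of range and the load contributes zero to the accumulation.

#include <limits.h>

/* Maps one output column to the input column the 1x1 kernel reads.
 * With SAME padding, output column out_x samples input column
 * out_x * stride - padding_left; anything outside [0, in_width) is
 * replaced by the INT_MIN sentinel, matching the select(...) calls
 * added to conv_2d_1x1.cl. */
static int Conv1x1InputX(int out_x, int stride, int padding_left, int in_width) {
  int in_x = out_x * stride - padding_left;
  return (in_x < 0 || in_x >= in_width) ? INT_MIN : in_x;
}

The row index follows the same pattern, with the batch offset (out_hb / height) * in_height added back for in-range rows, as in the out_hb_idx computation in the kernel.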