From b469a9451441ece4d2055e25d7267b99330f235a Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Fri, 3 Nov 2017 09:55:42 +0800
Subject: [PATCH] Finish depthwise 3x3 conv with stride 2.

---
 mace/kernels/opencl/cl/conv_2d_3x3.cl         | 17 +---
 mace/kernels/opencl/cl/conv_helper.h          | 15 +++
 mace/kernels/opencl/cl/depthwise_conv_3x3.cl  | 93 ++++++++++---------
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     | 28 +++---
 mace/kernels/opencl/depthwise_conv_opencl.cc  |  4 +-
 .../opencl/depthwise_conv_opencl_3x3.cc       | 73 +++++++++------
 6 files changed, 133 insertions(+), 97 deletions(-)
 create mode 100644 mace/kernels/opencl/cl/conv_helper.h

diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 1c97a581..cd4e5eed 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,11 +1,4 @@
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width);
-
+#include <conv_helper.h>
 void kernel conv_2d_3x3(global const float *input,
                         global const float *filter,
                         global const float *bias,
@@ -18,9 +11,9 @@ void kernel conv_2d_3x3(global const float *input,
                         private const uint out_width,
                         private const uint stride_h,
                         private const uint stride_w) {
-  int batch = get_global_id(0);
-  int out_chan_blk = get_global_id(1);
-  int out_pixel_blk = get_global_id(2);
+  const int batch = get_global_id(0);
+  const int out_chan_blk = get_global_id(1);
+  const int out_pixel_blk = get_global_id(2);
 
   const uint in_pixel = in_height * in_width;
   const uint out_pixel = out_height * out_width;
@@ -43,10 +36,10 @@ void kernel conv_2d_3x3(global const float *input,
   uint pixels = out_pixel_end - out_pixel_begin;
 
   for (uint i = out_chan_begin; i < out_chan_end; ++i) {
-    float4 res = (float4)bias[i];
     float *output_ptr = output_base + i * out_pixel;
     const float *filter_base = filter + i * in_chan_num * 9;
     if (pixels == 4) {
+      float4 res = (float4)bias[i];
       for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
         const float* input_ptr = input_base + in_chan_idx * in_pixel;
         const float* filter_ptr = filter_base + in_chan_idx * 9;
diff --git a/mace/kernels/opencl/cl/conv_helper.h b/mace/kernels/opencl/cl/conv_helper.h
new file mode 100644
index 00000000..553af09d
--- /dev/null
+++ b/mace/kernels/opencl/cl/conv_helper.h
@@ -0,0 +1,15 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
+#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
+
+float4 conv1x3_s1(const float *input_ptr,
+                  const float *filter_ptr);
+float4 conv1x3_s2(const float *input_ptr,
+                  const float *filter_ptr);
+float conv3x3(const float *input_ptr,
+              const float *filter_ptr,
+              const int row_width);
+#endif //  MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
index 2cdbe3fa..b88e339a 100644
--- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
@@ -1,61 +1,66 @@
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-                     const float *filter_ptr,
-                     const int row_width);
+#include <conv_helper.h>
+//TODO merge the depthwise with conv 3x3 to remove duplicate code.
+void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
+                               global const float *filter, /* m, i, kh, kw */
+                               global const float *bias, /* o */
+                               global float *output, /* n, c, h, w */
+                               private const uint in_chan_num,
+                               private const uint out_chan_num,
+                               private const uint in_height,
+                               private const uint in_width,
+                               private const uint out_height,
+                               private const uint out_width,
+                               private const uint stride_h,
+                               private const uint stride_w) {
+  const int batch = get_global_id(0);
+  const int out_chan_blk = get_global_id(1);
+  const int out_pixel_blk = get_global_id(2);
 
-void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
-                                  global const float *filter, /* m, i, kh, kw */
-                                  global const float *bias, /* o */
-                                  global float *output, /* n, c, h, w */
-                                  private const int in_chan_num,
-                                  private const int out_chan_num,
-                                  private const int in_height,
-                                  private const int in_width,
-                                  private const int out_height,
-                                  private const int out_width) {
-  int batch = get_global_id(0);
-  int out_chan_blk = get_global_id(1);
-  int out_pixel_blk = get_global_id(2);
+  const uint in_pixel = in_height * in_width;
+  const uint out_pixel = out_height * out_width;
+  const uint multiplier = out_chan_num / in_chan_num;
 
-  const int in_pixel = in_height * in_width;
-  const int out_pixel = out_height * out_width;
-  const int multiplier = out_chan_num / in_chan_num;
+  const uint round_out_width = (out_width + 3) / 4;
+  const uint out_pixel_height = out_pixel_blk / round_out_width;
+  const uint out_pixel_width = out_pixel_blk % round_out_width;
 
-  const int round_out_width = (out_width + 3) / 4;
-  const int out_pixel_height = out_pixel_blk / round_out_width;
-  const int out_pixel_width = out_pixel_blk % round_out_width;
+  const uint out_chan_begin = out_chan_blk * 4;
+  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
+  const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
+  const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
+  const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
 
-  const int out_chan_begin = out_chan_blk * 4;
-  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
-  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
-  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
-
-  const int in_offset = batch * in_chan_num * in_pixel;
-  const int out_offset = batch * out_chan_num * out_pixel;
+  const uint in_offset = batch * in_chan_num * in_pixel;
+  const uint out_offset = batch * out_chan_num * out_pixel;
   const float *input_base = input + in_offset + in_pixel_begin;
   float *output_base = output + out_offset + out_pixel_begin;
 
-  int pixels = out_pixel_end - out_pixel_begin;
+  uint pixels = out_pixel_end - out_pixel_begin;
 
-  for (int i = out_chan_begin; i < out_chan_end; ++i) {
+  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
     float bias_value = bias[i];
     const float *input_ptr = input_base + (i / multiplier) * in_pixel;
     const float *filter_ptr = filter + i * 9;
     float *output_ptr = output_base + i * out_pixel;
-    if (pixels < 4) {
-      for (int out_idx = 0; out_idx < pixels; ++out_idx) {
-        output_ptr[out_idx] = bias_value;
-        output_ptr[out_idx] += conv3x3(input_ptr, filter_ptr, in_width);
-        input_ptr += 1;
+    if (pixels == 4) {
+      float4 res = (float4)bias[i];
+      if (stride_w == 1) {
+        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+      } else {
+        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
       }
-    } else {
-      float4 res = (float4)bias_value;
-      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
       vstore4(res, 0, output_ptr);
+    } else {
+      for (uint p = 0; p < pixels; ++p) {
+        float res = bias[i];
+        res += conv3x3(input_ptr, filter_ptr, in_width);
+        output_ptr[p] = res;
+        input_ptr += stride_w;
+      }
     }
   }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 448f31d6..90011b8f 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,21 +22,21 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");
+  auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");
 
   uint32_t idx = 0;
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
-  bm_kernel.setArg(idx++, stride);
-  bm_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(channels));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  conv_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, stride);
   const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                            static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(pixel_blocks)};
@@ -44,7 +44,7 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(1),
                            static_cast<uint32_t>(256)};
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      bm_kernel, cl::NullRange,
+      conv_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
       cl::NDRange(lws[0], lws[1], lws[2]));
   MACE_CHECK(error == CL_SUCCESS);
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index ca6a5b44..7dcb996f 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -10,6 +10,8 @@ namespace kernels {
 extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, Tensor *output);
 
+extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
+                                      const Tensor *bias, Tensor *output);
 template <>
 void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                                    const Tensor *filter,
@@ -21,7 +23,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
   static const Conv2dOpenclFunction selector[5][2] = {
       {nullptr, nullptr},
       {nullptr, nullptr},
-      {DepthwiseConvOpenclK3x3S1, nullptr},
+      {DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2},
       {nullptr, nullptr},
       {nullptr, nullptr}};
 
diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
index e76858a5..0c393e76 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -9,10 +9,11 @@
 namespace mace {
 namespace kernels {
 
-extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
-                                      const Tensor *filter,
-                                      const Tensor *bias,
-                                      Tensor *output) {
+static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
+                                            const Tensor *filter,
+                                            const Tensor *bias,
+                                            const uint32_t stride,
+                                            Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t channels = output->dim(1);
   const index_t height = output->dim(2);
@@ -24,33 +25,53 @@ extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
   const index_t input_width = input->dim(3);
 
   MACE_CHECK(input_batch == batch);
-
-  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
-                                   int, int, int, int, int, int, int>(program, "depthwise_conv_3x3_s1");
   const index_t pixels = height * width;
   const index_t channel_blocks = (channels + 3) / 4;
   const index_t pixel_blocks = (width + 3) / 4 * height;
 
-  cl_int error;
-  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
-                          cl::NDRange(static_cast<int>(batch),
-                                      static_cast<int>(channel_blocks),
-                                      static_cast<int>(pixel_blocks)),
-                          cl::NDRange(1, 1, 256)),
-          *(static_cast<cl::Buffer *>(input->buffer())),
-          *(static_cast<cl::Buffer *>(filter->buffer())),
-          *(static_cast<cl::Buffer *>(bias->buffer())),
-          *(static_cast<cl::Buffer *>(output->buffer())),
-          static_cast<int>(input_channels),
-          static_cast<int>(channels),
-          static_cast<int>(input_height),
-          static_cast<int>(input_width),
-          static_cast<int>(height),
-          static_cast<int>(width),
-          error);
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
+
+  uint32_t idx = 0;
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(channels));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  conv_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, stride);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
+                           static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(pixel_blocks)};
+  const uint32_t lws[3] = {static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(256)};
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      conv_kernel, cl::NullRange,
+      cl::NDRange(gws[0], gws[1], gws[2]),
+      cl::NDRange(lws[0], lws[1], lws[2]));
   MACE_CHECK(error == CL_SUCCESS);
+}
+
+extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
+                                      const Tensor *filter,
+                                      const Tensor *bias,
+                                      Tensor *output) {
+  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
+};
+
+extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
+                                      const Tensor *filter,
+                                      const Tensor *bias,
+                                      Tensor *output) {
+  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
 };
 
 }  // namespace kernels
-- 
GitLab