Finish depthwise 3x3 conv with stride 2.

b469a945 · liuqi · a7290142 · b469a945 · b469a945 · b469a945
6 changed file
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width);
-
+#include <conv_helper.h>
 void kernel conv_2d_3x3(global const float *input,
                        global const float *filter,
                        global const float *bias,
@@ -18,9 +11,9 @@ void kernel conv_2d_3x3(global const float *input,
                        private const uint out_width,
                        private const uint stride_h,
                        private const uint stride_w) {
-  int batch = get_global_id(0);
-  int out_chan_blk = get_global_id(1);
-  int out_pixel_blk = get_global_id(2);
+  const int batch = get_global_id(0);
+  const int out_chan_blk = get_global_id(1);
+  const int out_pixel_blk = get_global_id(2);

  const uint in_pixel = in_height * in_width;
  const uint out_pixel = out_height * out_width;
@@ -43,10 +36,10 @@ void kernel conv_2d_3x3(global const float *input,
  uint pixels = out_pixel_end - out_pixel_begin;

  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
-    float4 res = (float4)bias[i];
    float *output_ptr = output_base + i * out_pixel;
    const float *filter_base = filter + i * in_chan_num * 9;
    if (pixels == 4) {
+      float4 res = (float4)bias[i];
      for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
        const float* input_ptr = input_base + in_chan_idx * in_pixel;
        const float* filter_ptr = filter_base + in_chan_idx * 9;

--- a/mace/kernels/opencl/cl/conv_helper.h
+++ b/mace/kernels/opencl/cl/conv_helper.h
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
+#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
+
+float4 conv1x3_s1(const float *input_ptr,
+                  const float *filter_ptr);
+float4 conv1x3_s2(const float *input_ptr,
+                  const float *filter_ptr);
+float conv3x3(const float *input_ptr,
+              const float *filter_ptr,
+              const int row_width);
+#endif //  MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
--- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-                     const float *filter_ptr,
-                     const int row_width);
+#include <conv_helper.h>
+//TODO merge the depthwise with conv 3x3 to remove duplicate code.
+void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
+                               global const float *filter, /* m, i, kh, kw */
+                               global const float *bias, /* o */
+                               global float *output, /* n, c, h, w */
+                               private const uint in_chan_num,
+                               private const uint out_chan_num,
+                               private const uint in_height,
+                               private const uint in_width,
+                               private const uint out_height,
+                               private const uint out_width,
+                               private const uint stride_h,
+                               private const uint stride_w) {
+  const int batch = get_global_id(0);
+  const int out_chan_blk = get_global_id(1);
+  const int out_pixel_blk = get_global_id(2);

-void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
-                                  global const float *filter, /* m, i, kh, kw */
-                                  global const float *bias, /* o */
-                                  global float *output, /* n, c, h, w */
-                                  private const int in_chan_num,
-                                  private const int out_chan_num,
-                                  private const int in_height,
-                                  private const int in_width,
-                                  private const int out_height,
-                                  private const int out_width) {
-  int batch = get_global_id(0);
-  int out_chan_blk = get_global_id(1);
-  int out_pixel_blk = get_global_id(2);
+  const uint in_pixel = in_height * in_width;
+  const uint out_pixel = out_height * out_width;
+  const uint multiplier = out_chan_num / in_chan_num;

-  const int in_pixel = in_height * in_width;
-  const int out_pixel = out_height * out_width;
-  const int multiplier = out_chan_num / in_chan_num;
+  const uint round_out_width = (out_width + 3) / 4;
+  const uint out_pixel_height = out_pixel_blk / round_out_width;
+  const uint out_pixel_width = out_pixel_blk % round_out_width;

-  const int round_out_width = (out_width + 3) / 4;
-  const int out_pixel_height = out_pixel_blk / round_out_width;
-  const int out_pixel_width = out_pixel_blk % round_out_width;
+  const uint out_chan_begin = out_chan_blk * 4;
+  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
+  const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
+  const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
+  const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;

-  const int out_chan_begin = out_chan_blk * 4;
-  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
-  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
-  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
-
-  const int in_offset = batch * in_chan_num * in_pixel;
-  const int out_offset = batch * out_chan_num * out_pixel;
+  const uint in_offset = batch * in_chan_num * in_pixel;
+  const uint out_offset = batch * out_chan_num * out_pixel;
  const float *input_base = input + in_offset + in_pixel_begin;
  float *output_base = output + out_offset + out_pixel_begin;

-  int pixels = out_pixel_end - out_pixel_begin;
+  uint pixels = out_pixel_end - out_pixel_begin;

-  for (int i = out_chan_begin; i < out_chan_end; ++i) {
+  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
    float bias_value = bias[i];
    const float *input_ptr = input_base + (i / multiplier) * in_pixel;
    const float *filter_ptr = filter + i * 9;
    float *output_ptr = output_base + i * out_pixel;
-    if (pixels < 4) {
-      for (int out_idx = 0; out_idx < pixels; ++out_idx) {
-        output_ptr[out_idx] = bias_value;
-        output_ptr[out_idx] += conv3x3(input_ptr, filter_ptr, in_width);
-        input_ptr += 1;
+    if (pixels == 4) {
+      float4 res = (float4)bias[i];
+      if (stride_w == 1) {
+        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+      } else {
+        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
      }
-    } else {
-      float4 res = (float4)bias_value;
-      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
      vstore4(res, 0, output_ptr);
+    } else {
+      for (uint p = 0; p < pixels; ++p) {
+        float res = bias[i];
+        res += conv3x3(input_ptr, filter_ptr, in_width);
+        output_ptr[p] = res;
+        input_ptr += stride_w;
+      }
    }
  }


--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,21 +22,21 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
-  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");
+  auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");

  uint32_t idx = 0;
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
-  bm_kernel.setArg(idx++, stride);
-  bm_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(channels));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  conv_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, stride);
  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};
@@ -44,7 +44,7 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
                           static_cast<uint32_t>(1),
                           static_cast<uint32_t>(256)};
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      bm_kernel, cl::NullRange,
+      conv_kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]));
  MACE_CHECK(error == CL_SUCCESS);

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -10,6 +10,8 @@ namespace kernels {
 extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                      const Tensor *bias, Tensor *output);

+extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
+                                      const Tensor *bias, Tensor *output);
 template <>
 void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                                   const Tensor *filter,
@@ -21,7 +23,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
  static const Conv2dOpenclFunction selector[5][2] = {
      {nullptr, nullptr},
      {nullptr, nullptr},
-      {DepthwiseConvOpenclK3x3S1, nullptr},
+      {DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2},
      {nullptr, nullptr},
      {nullptr, nullptr}};


--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -9,10 +9,11 @@
 namespace mace {
 namespace kernels {

-extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
-                                      const Tensor *filter,
-                                      const Tensor *bias,
-                                      Tensor *output) {
+static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
+                                            const Tensor *filter,
+                                            const Tensor *bias,
+                                            const uint32_t stride,
+                                            Tensor *output) {
  const index_t batch = output->dim(0);
  const index_t channels = output->dim(1);
  const index_t height = output->dim(2);
@@ -24,33 +25,53 @@ extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
  const index_t input_width = input->dim(3);

  MACE_CHECK(input_batch == batch);
-
-  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
-                                   int, int, int, int, int, int, int>(program, "depthwise_conv_3x3_s1");
  const index_t pixels = height * width;
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (width + 3) / 4 * height;

-  cl_int error;
-  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
-                          cl::NDRange(static_cast<int>(batch),
-                                      static_cast<int>(channel_blocks),
-                                      static_cast<int>(pixel_blocks)),
-                          cl::NDRange(1, 1, 256)),
-          *(static_cast<cl::Buffer *>(input->buffer())),
-          *(static_cast<cl::Buffer *>(filter->buffer())),
-          *(static_cast<cl::Buffer *>(bias->buffer())),
-          *(static_cast<cl::Buffer *>(output->buffer())),
-          static_cast<int>(input_channels),
-          static_cast<int>(channels),
-          static_cast<int>(input_height),
-          static_cast<int>(input_width),
-          static_cast<int>(height),
-          static_cast<int>(width),
-          error);
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
+
+  uint32_t idx = 0;
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(channels));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  conv_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  conv_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, stride);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
+                           static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(pixel_blocks)};
+  const uint32_t lws[3] = {static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(256)};
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      conv_kernel, cl::NullRange,
+      cl::NDRange(gws[0], gws[1], gws[2]),
+      cl::NDRange(lws[0], lws[1], lws[2]));
  MACE_CHECK(error == CL_SUCCESS);
+}
+
+extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
+                                      const Tensor *filter,
+                                      const Tensor *bias,
+                                      Tensor *output) {
+  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
+};
+
+extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
+                                      const Tensor *filter,
+                                      const Tensor *bias,
+                                      Tensor *output) {
+  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
 };

 }  // namespace kernels