[Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1 (#4281)

* [Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1. test=develop

[Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1 (#4281)
* [Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1. test=develop
bd4fd8d3 · zhaoyang-star · GitHub · 2b40367c · bd4fd8d3 · bd4fd8d3
2 changed file
--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
@@ -48,7 +48,7 @@ __kernel void depth_conv2d_3x3(
  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);

  int2 in_pos_in_one_block =
-      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block * stride_xy + (int2)(offset + dilation - 1, offset + dilation - 1);

 #ifdef BIASE_CH
  CL_DTYPE4 output =
@@ -77,13 +77,13 @@ __kernel void depth_conv2d_3x3(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
                    input,
                    sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
-                 in_pos_in_one_block.y - 1 < 0 ||
-                 in_pos_in_one_block.x - 1 >= input_width ||
-                 in_pos_in_one_block.y - 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                 in_pos_in_one_block.y - dilation < 0 ||
+                 in_pos_in_one_block.x - dilation >= input_width ||
+                 in_pos_in_one_block.y - dilation >= input_height)
                << 15));

  inputs[1] = select(
@@ -91,45 +91,37 @@ __kernel void depth_conv2d_3x3(
                    input,
                    sampler,
                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
-                           pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 ||
                 in_pos_in_one_block.x >= input_width ||
-                 in_pos_in_one_block.y - 1 >= input_height)
+                 in_pos_in_one_block.y - dilation >= input_height)
                << 15));

  inputs[2] = select(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
                    input,
                    sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
-                 in_pos_in_one_block.y - 1 < 0 ||
-                 in_pos_in_one_block.x + 1 >= input_width ||
-                 in_pos_in_one_block.y - 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                 in_pos_in_one_block.y - dilation < 0 ||
+                 in_pos_in_one_block.x + dilation >= input_width ||
+                 in_pos_in_one_block.y - dilation >= input_height)
                << 15));

  inputs[3] = select(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
                    input,
                    sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
                           pos_in_input_block.y + in_pos_in_one_block.y)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
-                 in_pos_in_one_block.x - 1 >= input_width ||
+      (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x - dilation >= input_width ||
                 in_pos_in_one_block.y >= input_height)
                << 15));
-  /*
-  if (output_pos.x == 112 && output_pos.y == 0) {
-        CL_DTYPE4 input1 = inputs[3];
-        float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-        printf(" input4 3 - %v4hlf \n", in);
-        printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
-  }
-  */

  inputs[4] = select(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
@@ -147,11 +139,11 @@ __kernel void depth_conv2d_3x3(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
                    input,
                    sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
                           pos_in_input_block.y + in_pos_in_one_block.y)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
-                 in_pos_in_one_block.x + 1 >= input_width ||
+      (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x + dilation >= input_width ||
                 in_pos_in_one_block.y >= input_height)
                << 15));

@@ -159,13 +151,13 @@ __kernel void depth_conv2d_3x3(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
                    input,
                    sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
-                 in_pos_in_one_block.y + 1 < 0 ||
-                 in_pos_in_one_block.x - 1 >= input_width ||
-                 in_pos_in_one_block.y + 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                 in_pos_in_one_block.y + dilation < 0 ||
+                 in_pos_in_one_block.x - dilation >= input_width ||
+                 in_pos_in_one_block.y + dilation >= input_height)
                << 15));

  inputs[7] = select(
@@ -173,24 +165,24 @@ __kernel void depth_conv2d_3x3(
                    input,
                    sampler,
                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
-                           pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 ||
                 in_pos_in_one_block.x >= input_width ||
-                 in_pos_in_one_block.y + 1 >= input_height)
+                 in_pos_in_one_block.y + dilation >= input_height)
                << 15));

  inputs[8] = select(
      READ_IMG_TYPE(CL_DTYPE_CHAR,
                    input,
                    sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
      (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
-                 in_pos_in_one_block.y + 1 < 0 ||
-                 in_pos_in_one_block.x + 1 >= input_width ||
-                 in_pos_in_one_block.y + 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                 in_pos_in_one_block.y + dilation < 0 ||
+                 in_pos_in_one_block.x + dilation >= input_width ||
+                 in_pos_in_one_block.y + dilation >= input_height)
                << 15));

  CL_DTYPE4 filters[9];
@@ -221,14 +213,18 @@ __kernel void depth_conv2d_3x3(

  /*

-  if (output_pos.x == 112 && output_pos.y == 0) {
+  if (output_pos.x == 0 && output_pos.y == 0) {

      for (int i = 0; i < 9; ++i) {
          CL_DTYPE4 input1 = inputs[i];
          float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-          printf(" input4 %d - %v4hlf \n", i, in);
+          printf(" input4[%d]: %v4hlf \n", i, in);
+      }
+      for (int i = 0; i < 9; ++i) {
+          CL_DTYPE4 filters1 = filters[i];
+          float4 f = (float4)(filters1.x, filters1.y, filters1.z, filters1.w);
+          printf(" weights4[%d]: %v4hlf \n", i, f);
      }
-
      float4 out = (float4)(output.x, output.y, output.z, output.w);
      printf(" depth wise output output4 = %v4hlf \n", out);
      printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);

--- a/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
@@ -32,6 +32,93 @@ namespace lite {
 // #define TEST_DEPTHWISE_CONV_IMAGE_BASIC
 #define TEST_DEPTHWISE_CONV_IMAGE_3X3

+template <typename Dtype1, typename Dtype2>
+static void conv_basic(const Dtype1* din,
+                       Dtype2* dout,
+                       int num,
+                       int chout,
+                       int hout,
+                       int wout,
+                       int chin,
+                       int hin,
+                       int win,
+                       const Dtype1* weights,
+                       const Dtype2* bias,
+                       int group,
+                       int kernel_w,
+                       int kernel_h,
+                       int stride_w,
+                       int stride_h,
+                       int dila_w,
+                       int dila_h,
+                       int pad_w,
+                       int pad_h,
+                       bool flag_bias,
+                       bool flag_relu) {
+  CHECK(!flag_relu);
+  auto src_data = din;
+  auto dst_data_ref = dout;
+  auto weights_data = weights;
+  auto with_bias = flag_bias;
+  auto bias_data = bias;
+
+  int in_num = num;
+  int out_channels = chout;
+  int out_h = hout;
+  int out_w = wout;
+
+  int in_channel = chin;
+  int in_h = hin;
+  int in_w = win;
+  int out_c_group = out_channels / group;
+  int in_c_group = in_channel / group;
+
+  for (int n = 0; n < in_num; ++n) {
+    for (int g = 0; g < group; ++g) {
+      for (int oc = 0; oc < out_c_group; ++oc) {
+        for (int oh = 0; oh < out_h; ++oh) {
+          for (int ow = 0; ow < out_w; ++ow) {
+            int out_idx = n * group * out_c_group * out_h * out_w +
+                          g * out_c_group * out_h * out_w + oc * out_h * out_w +
+                          oh * out_w + ow;
+            Dtype2 bias_d =
+                with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0;
+            dst_data_ref[out_idx] = bias_d;
+            for (int ic = 0; ic < in_c_group; ++ic) {
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int iw = ow * stride_w - pad_w + kw * (dila_w);
+                  int ih = oh * stride_h - pad_h + kh * (dila_h);
+                  if (iw < 0 || iw >= in_w) continue;
+                  if (ih < 0 || ih >= in_h) continue;
+
+                  int iidx = n * in_channel * in_h * in_w +
+                             g * in_c_group * in_h * in_w + ic * in_h * in_w +
+                             ih * in_w + iw;
+                  int widx =
+                      g * out_c_group * in_c_group * kernel_h * kernel_w +
+                      oc * in_c_group * kernel_h * kernel_w +
+                      ic * kernel_h * kernel_w + kh * kernel_w + kw;
+
+                  dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx];
+                  /*
+                  if (out_idx == 0) {
+                     VLOG(5) << "src[" << iidx << "]: " << src_data[iidx]
+                             << "\tweights[" << widx << "]: "
+                             << weights_data[widx]
+                             << "\tdst[" << out_idx << "]: "
+                             << dst_data_ref[out_idx];
+                  */
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 template <typename T, int STRIDE_H = 1, int STRIDE_W = 1>
 void depth_conv(const T* input_data,
                const lite::DDim& input_dims,
@@ -384,11 +471,14 @@ TEST(depthwise_conv2d, compute_basic) {
 #ifdef TEST_DEPTHWISE_CONV_IMAGE_3X3
 // #define LOOP_TEST
 TEST(depthwise_conv2d, compute_image2d_3x3) {
+  const int fc = 1;
  const int fw = 3;
  const int fh = fw;
-  int dilation = 1;
-  int stride = 1;
-  int pad = 0;
+  const int dilation = 4;
+  const int stride = 2;
+  const int pad = 2;
+  const bool bias_flag = false;
+  const bool relu_flag = false;
 #ifdef LOOP_TEST
  // for (int batch_size = 1; batch_size < 2; ++batch_size) {
  for (int oc = 4; oc < 10; oc += 1) {      // oc = ic
@@ -399,12 +489,18 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
  const int ih = 112;
  const int iw = 112;
 #endif
-        stride = (stride == 1) ? 2 : 1;
-        // pad = (pad == 0) ? 1 : 0;
        const int fb = oc;
        const int ic = oc;
        const int oh = ConvOutputSize(ih, fh, dilation, pad, pad, stride);
        const int ow = ConvOutputSize(iw, fw, dilation, pad, pad, stride);
+        if (oh <= 0 || ow <= 0) {
+#ifdef LOOP_TEST
+          continue;
+#else
+    LOG(FATAL) << "Output tensor of depthwise conv is illegal!"
+               << "Please check your input dims and conv params";
+#endif
+        }

        LOG(INFO) << "to get kernel ...";
        auto kernels =
@@ -417,7 +513,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
        auto kernel = std::move(kernels.front());

        LOG(INFO) << "get kernel";
-        lite::Tensor input, filter, output;
+        lite::Tensor input, filter, bias, output;
        operators::ConvParam param;
        param.x = &input;
        param.filter = &filter;
@@ -428,6 +524,8 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
        param.strides = std::vector<int>{stride, stride};
        std::vector<int> dilations = {dilation, dilation};
        param.dilations = std::make_shared<std::vector<int>>(dilations);
+        param.bias = bias_flag ? &bias : nullptr;
+        param.fuse_relu = relu_flag;

        std::unique_ptr<KernelContext> context(new KernelContext);
        context->As<OpenCLContext>().InitOnce();
@@ -442,9 +540,11 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
        const DDim& input_dim =
            lite::DDim{std::vector<int64_t>({1, ic, ih, iw})};
        const DDim& filter_dim =
-            lite::DDim{std::vector<int64_t>({fb, 1, 3, 3})};
+            lite::DDim{std::vector<int64_t>({fb, fc, fh, fw})};
        const DDim& output_dim =
            lite::DDim{std::vector<int64_t>({1, oc, oh, ow})};
+        // element wise bias
+        const DDim bias_dim = DDim(std::vector<DDim::value_type>{oc});
        input.Resize(input_dim);
        filter.Resize(filter_dim);
        output.Resize(output_dim);
@@ -460,6 +560,14 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
        for (auto& f : filter_v) {
          f = gen(engine);
        }
+        std::vector<float> bias_v;
+        if (bias_flag) {
+          bias.Resize(bias_dim);
+          bias_v.resize(bias_dim.production());
+          for (auto& b : bias_v) {
+            b = gen(engine);
+          }
+        }

        LOG(INFO) << "prepare input";
        CLImageConverterDefault* default_converter =
@@ -496,21 +604,29 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
        lite::Tensor out_ref;
        out_ref.Resize(output_dim);
        auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
-        if (stride == 1) {
-          depth_conv<float, 1, 1>(input_v.data(),
-                                  input.dims(),
-                                  filter_v.data(),
-                                  filter.dims(),
-                                  out_ref_data,
-                                  out_ref.dims());
-        } else if (stride == 2) {
-          depth_conv<float, 2, 2>(input_v.data(),
-                                  input.dims(),
-                                  filter_v.data(),
-                                  filter.dims(),
-                                  out_ref_data,
-                                  out_ref.dims());
-        }
+
+        conv_basic<float, float>(input_v.data(),
+                                 out_ref_data,
+                                 1,
+                                 oc,
+                                 oh,
+                                 ow,
+                                 ic,
+                                 ih,
+                                 iw,
+                                 filter_v.data(),
+                                 bias_v.data(),
+                                 param.groups,
+                                 fw,
+                                 fh,
+                                 stride,
+                                 stride,
+                                 dilation,
+                                 dilation,
+                                 pad,
+                                 pad,
+                                 bias_flag,
+                                 relu_flag);

        const size_t cl_image2d_row_pitch{0};
        const size_t cl_image2d_slice_pitch{0};
@@ -538,7 +654,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
          EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
                       abs_diff > FP16_ABS_DIFF);
          if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
-            LOG(FATAL) << "error idx:" << i << "output_v[" << i
+            LOG(FATAL) << "error idx:" << i << " output_v[" << i
                       << "]:" << output_v[i] << " "
                                                 "out_ref_data["
                       << i << "]:" << out_ref_data[i];