From e439268f1b26ed13cb76baadb33a258e1f2a9304 Mon Sep 17 00:00:00 2001
From: zhaoyang-star <zhaoyangstar@foxmail.com>
Date: Wed, 16 Sep 2020 14:53:38 +0800
Subject: [PATCH] [Cherry-pick][Bugfix][OpenCL] fix depthwise_conv2d_3x3 with
 dilation > 1 (#4281) (#4325)

* [Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1 (#4281)

Co-authored-by: ysh329 <ysh329@users.noreply.github.com>
---
 .../image/depthwise_conv2d_kernel.cl          |  92 +++++-----
 .../depthwise_conv2d_image_compute_test.cc    | 162 +++++++++++++++---
 2 files changed, 183 insertions(+), 71 deletions(-)

diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
index 6fbdc21f93..7d86730b93 100755
--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
@@ -48,7 +48,7 @@ __kernel void depth_conv2d_3x3(
   int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
 
   int2 in_pos_in_one_block =
-      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+      ouput_pos_in_one_block * stride_xy + (int2)(offset + dilation - 1, offset + dilation - 1);
 
 #ifdef BIASE_CH
   CL_DTYPE4 output =
@@ -77,13 +77,13 @@ __kernel void depth_conv2d_3x3(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
                     input,
                     sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
-                 in_pos_in_one_block.y - 1 < 0 ||
-                 in_pos_in_one_block.x - 1 >= input_width ||
-                 in_pos_in_one_block.y - 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                 in_pos_in_one_block.y - dilation < 0 ||
+                 in_pos_in_one_block.x - dilation >= input_width ||
+                 in_pos_in_one_block.y - dilation >= input_height)
                 << 15));
 
   inputs[1] = select(
@@ -91,45 +91,37 @@ __kernel void depth_conv2d_3x3(
                     input,
                     sampler,
                     (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
-                           pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                           pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 ||
                  in_pos_in_one_block.x >= input_width ||
-                 in_pos_in_one_block.y - 1 >= input_height)
+                 in_pos_in_one_block.y - dilation >= input_height)
                 << 15));
 
   inputs[2] = select(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
                     input,
                     sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
-                 in_pos_in_one_block.y - 1 < 0 ||
-                 in_pos_in_one_block.x + 1 >= input_width ||
-                 in_pos_in_one_block.y - 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                 in_pos_in_one_block.y - dilation < 0 ||
+                 in_pos_in_one_block.x + dilation >= input_width ||
+                 in_pos_in_one_block.y - dilation >= input_height)
                 << 15));
 
   inputs[3] = select(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
                     input,
                     sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
                            pos_in_input_block.y + in_pos_in_one_block.y)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
-                 in_pos_in_one_block.x - 1 >= input_width ||
+      (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x - dilation >= input_width ||
                  in_pos_in_one_block.y >= input_height)
                 << 15));
-  /*
-  if (output_pos.x == 112 && output_pos.y == 0) {
-        CL_DTYPE4 input1 = inputs[3];
-        float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-        printf(" input4 3 - %v4hlf \n", in);
-        printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
-  }
-  */
 
   inputs[4] = select(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
@@ -147,11 +139,11 @@ __kernel void depth_conv2d_3x3(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
                     input,
                     sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
                            pos_in_input_block.y + in_pos_in_one_block.y)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
-                 in_pos_in_one_block.x + 1 >= input_width ||
+      (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 ||
+                 in_pos_in_one_block.x + dilation >= input_width ||
                  in_pos_in_one_block.y >= input_height)
                 << 15));
 
@@ -159,13 +151,13 @@ __kernel void depth_conv2d_3x3(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
                     input,
                     sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
-                 in_pos_in_one_block.y + 1 < 0 ||
-                 in_pos_in_one_block.x - 1 >= input_width ||
-                 in_pos_in_one_block.y + 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                 in_pos_in_one_block.y + dilation < 0 ||
+                 in_pos_in_one_block.x - dilation >= input_width ||
+                 in_pos_in_one_block.y + dilation >= input_height)
                 << 15));
 
   inputs[7] = select(
@@ -173,24 +165,24 @@ __kernel void depth_conv2d_3x3(
                     input,
                     sampler,
                     (int2)(pos_in_input_block.x + in_pos_in_one_block.x,
-                           pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                           pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
+      (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 ||
                  in_pos_in_one_block.x >= input_width ||
-                 in_pos_in_one_block.y + 1 >= input_height)
+                 in_pos_in_one_block.y + dilation >= input_height)
                 << 15));
 
   inputs[8] = select(
       READ_IMG_TYPE(CL_DTYPE_CHAR,
                     input,
                     sampler,
-                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
-                           pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                    (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
+                           pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
       (CL_DTYPE4)(0.0f),
-      (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
-                 in_pos_in_one_block.y + 1 < 0 ||
-                 in_pos_in_one_block.x + 1 >= input_width ||
-                 in_pos_in_one_block.y + 1 >= input_height)
+      (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                 in_pos_in_one_block.y + dilation < 0 ||
+                 in_pos_in_one_block.x + dilation >= input_width ||
+                 in_pos_in_one_block.y + dilation >= input_height)
                 << 15));
 
   CL_DTYPE4 filters[9];
@@ -221,14 +213,18 @@ __kernel void depth_conv2d_3x3(
 
   /*
 
-  if (output_pos.x == 112 && output_pos.y == 0) {
+  if (output_pos.x == 0 && output_pos.y == 0) {
 
       for (int i = 0; i < 9; ++i) {
           CL_DTYPE4 input1 = inputs[i];
           float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
-          printf(" input4 %d - %v4hlf \n", i, in);
+          printf(" input4[%d]: %v4hlf \n", i, in);
+      }
+      for (int i = 0; i < 9; ++i) {
+          CL_DTYPE4 filters1 = filters[i];
+          float4 f = (float4)(filters1.x, filters1.y, filters1.z, filters1.w);
+          printf(" weights4[%d]: %v4hlf \n", i, f);
       }
-
       float4 out = (float4)(output.x, output.y, output.z, output.w);
       printf(" depth wise output output4 = %v4hlf \n", out);
       printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
diff --git a/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
index e36be300ba..2199d28716 100644
--- a/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
+++ b/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc
@@ -32,6 +32,93 @@ namespace lite {
 // #define TEST_DEPTHWISE_CONV_IMAGE_BASIC
 #define TEST_DEPTHWISE_CONV_IMAGE_3X3
 
+template <typename Dtype1, typename Dtype2>
+static void conv_basic(const Dtype1* din,
+                       Dtype2* dout,
+                       int num,
+                       int chout,
+                       int hout,
+                       int wout,
+                       int chin,
+                       int hin,
+                       int win,
+                       const Dtype1* weights,
+                       const Dtype2* bias,
+                       int group,
+                       int kernel_w,
+                       int kernel_h,
+                       int stride_w,
+                       int stride_h,
+                       int dila_w,
+                       int dila_h,
+                       int pad_w,
+                       int pad_h,
+                       bool flag_bias,
+                       bool flag_relu) {
+  CHECK(!flag_relu);
+  auto src_data = din;
+  auto dst_data_ref = dout;
+  auto weights_data = weights;
+  auto with_bias = flag_bias;
+  auto bias_data = bias;
+
+  int in_num = num;
+  int out_channels = chout;
+  int out_h = hout;
+  int out_w = wout;
+
+  int in_channel = chin;
+  int in_h = hin;
+  int in_w = win;
+  int out_c_group = out_channels / group;
+  int in_c_group = in_channel / group;
+
+  for (int n = 0; n < in_num; ++n) {
+    for (int g = 0; g < group; ++g) {
+      for (int oc = 0; oc < out_c_group; ++oc) {
+        for (int oh = 0; oh < out_h; ++oh) {
+          for (int ow = 0; ow < out_w; ++ow) {
+            int out_idx = n * group * out_c_group * out_h * out_w +
+                          g * out_c_group * out_h * out_w + oc * out_h * out_w +
+                          oh * out_w + ow;
+            Dtype2 bias_d =
+                with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0;
+            dst_data_ref[out_idx] = bias_d;
+            for (int ic = 0; ic < in_c_group; ++ic) {
+              for (int kh = 0; kh < kernel_h; ++kh) {
+                for (int kw = 0; kw < kernel_w; ++kw) {
+                  int iw = ow * stride_w - pad_w + kw * (dila_w);
+                  int ih = oh * stride_h - pad_h + kh * (dila_h);
+                  if (iw < 0 || iw >= in_w) continue;
+                  if (ih < 0 || ih >= in_h) continue;
+
+                  int iidx = n * in_channel * in_h * in_w +
+                             g * in_c_group * in_h * in_w + ic * in_h * in_w +
+                             ih * in_w + iw;
+                  int widx =
+                      g * out_c_group * in_c_group * kernel_h * kernel_w +
+                      oc * in_c_group * kernel_h * kernel_w +
+                      ic * kernel_h * kernel_w + kh * kernel_w + kw;
+
+                  dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx];
+                  /*
+                  if (out_idx == 0) {
+                     VLOG(5) << "src[" << iidx << "]: " << src_data[iidx]
+                             << "\tweights[" << widx << "]: "
+                             << weights_data[widx]
+                             << "\tdst[" << out_idx << "]: "
+                             << dst_data_ref[out_idx];
+                  */
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 template <typename T, int STRIDE_H = 1, int STRIDE_W = 1>
 void depth_conv(const T* input_data,
                 const lite::DDim& input_dims,
@@ -384,11 +471,14 @@ TEST(depthwise_conv2d, compute_basic) {
 #ifdef TEST_DEPTHWISE_CONV_IMAGE_3X3
 // #define LOOP_TEST
 TEST(depthwise_conv2d, compute_image2d_3x3) {
+  const int fc = 1;
   const int fw = 3;
   const int fh = fw;
-  int dilation = 1;
-  int stride = 1;
-  int pad = 0;
+  const int dilation = 4;
+  const int stride = 2;
+  const int pad = 2;
+  const bool bias_flag = false;
+  const bool relu_flag = false;
 #ifdef LOOP_TEST
   // for (int batch_size = 1; batch_size < 2; ++batch_size) {
   for (int oc = 4; oc < 10; oc += 1) {      // oc = ic
@@ -399,12 +489,18 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
   const int ih = 112;
   const int iw = 112;
 #endif
-        stride = (stride == 1) ? 2 : 1;
-        // pad = (pad == 0) ? 1 : 0;
         const int fb = oc;
         const int ic = oc;
         const int oh = ConvOutputSize(ih, fh, dilation, pad, pad, stride);
         const int ow = ConvOutputSize(iw, fw, dilation, pad, pad, stride);
+        if (oh <= 0 || ow <= 0) {
+#ifdef LOOP_TEST
+          continue;
+#else
+    LOG(FATAL) << "Output tensor of depthwise conv is illegal!"
+               << "Please check your input dims and conv params";
+#endif
+        }
 
         LOG(INFO) << "to get kernel ...";
         auto kernels =
@@ -417,7 +513,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
         auto kernel = std::move(kernels.front());
 
         LOG(INFO) << "get kernel";
-        lite::Tensor input, filter, output;
+        lite::Tensor input, filter, bias, output;
         operators::ConvParam param;
         param.x = &input;
         param.filter = &filter;
@@ -428,6 +524,8 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
         param.strides = std::vector<int>{stride, stride};
         std::vector<int> dilations = {dilation, dilation};
         param.dilations = std::make_shared<std::vector<int>>(dilations);
+        param.bias = bias_flag ? &bias : nullptr;
+        param.fuse_relu = relu_flag;
 
         std::unique_ptr<KernelContext> context(new KernelContext);
         context->As<OpenCLContext>().InitOnce();
@@ -442,9 +540,11 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
         const DDim& input_dim =
             lite::DDim{std::vector<int64_t>({1, ic, ih, iw})};
         const DDim& filter_dim =
-            lite::DDim{std::vector<int64_t>({fb, 1, 3, 3})};
+            lite::DDim{std::vector<int64_t>({fb, fc, fh, fw})};
         const DDim& output_dim =
             lite::DDim{std::vector<int64_t>({1, oc, oh, ow})};
+        // element wise bias
+        const DDim bias_dim = DDim(std::vector<DDim::value_type>{oc});
         input.Resize(input_dim);
         filter.Resize(filter_dim);
         output.Resize(output_dim);
@@ -460,6 +560,14 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
         for (auto& f : filter_v) {
           f = gen(engine);
         }
+        std::vector<float> bias_v;
+        if (bias_flag) {
+          bias.Resize(bias_dim);
+          bias_v.resize(bias_dim.production());
+          for (auto& b : bias_v) {
+            b = gen(engine);
+          }
+        }
 
         LOG(INFO) << "prepare input";
         CLImageConverterDefault* default_converter =
@@ -496,21 +604,29 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
         lite::Tensor out_ref;
         out_ref.Resize(output_dim);
         auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
-        if (stride == 1) {
-          depth_conv<float, 1, 1>(input_v.data(),
-                                  input.dims(),
-                                  filter_v.data(),
-                                  filter.dims(),
-                                  out_ref_data,
-                                  out_ref.dims());
-        } else if (stride == 2) {
-          depth_conv<float, 2, 2>(input_v.data(),
-                                  input.dims(),
-                                  filter_v.data(),
-                                  filter.dims(),
-                                  out_ref_data,
-                                  out_ref.dims());
-        }
+
+        conv_basic<float, float>(input_v.data(),
+                                 out_ref_data,
+                                 1,
+                                 oc,
+                                 oh,
+                                 ow,
+                                 ic,
+                                 ih,
+                                 iw,
+                                 filter_v.data(),
+                                 bias_v.data(),
+                                 param.groups,
+                                 fw,
+                                 fh,
+                                 stride,
+                                 stride,
+                                 dilation,
+                                 dilation,
+                                 pad,
+                                 pad,
+                                 bias_flag,
+                                 relu_flag);
 
         const size_t cl_image2d_row_pitch{0};
         const size_t cl_image2d_slice_pitch{0};
@@ -538,7 +654,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) {
           EXPECT_FALSE(relative_diff > FP16_MAX_DIFF &&
                        abs_diff > FP16_ABS_DIFF);
           if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) {
-            LOG(FATAL) << "error idx:" << i << "output_v[" << i
+            LOG(FATAL) << "error idx:" << i << " output_v[" << i
                        << "]:" << output_v[i] << " "
                                                  "out_ref_data["
                        << i << "]:" << out_ref_data[i];
-- 
GitLab