From e439268f1b26ed13cb76baadb33a258e1f2a9304 Mon Sep 17 00:00:00 2001 From: zhaoyang-star Date: Wed, 16 Sep 2020 14:53:38 +0800 Subject: [PATCH] [Cherry-pick][Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1 (#4281) (#4325) * [Bugfix][OpenCL] fix depthwise_conv2d_3x3 with dilation > 1 (#4281) Co-authored-by: ysh329 --- .../image/depthwise_conv2d_kernel.cl | 92 +++++----- .../depthwise_conv2d_image_compute_test.cc | 162 +++++++++++++++--- 2 files changed, 183 insertions(+), 71 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 6fbdc21f93..7d86730b93 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -48,7 +48,7 @@ __kernel void depth_conv2d_3x3( int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); int2 in_pos_in_one_block = - ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block * stride_xy + (int2)(offset + dilation - 1, offset + dilation - 1); #ifdef BIASE_CH CL_DTYPE4 output = @@ -77,13 +77,13 @@ __kernel void depth_conv2d_3x3( READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, - pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation, + pos_in_input_block.y + in_pos_in_one_block.y - dilation)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || - in_pos_in_one_block.y - 1 < 0 || - in_pos_in_one_block.x - 1 >= input_width || - in_pos_in_one_block.y - 1 >= input_height) + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) << 15)); inputs[1] = select( @@ -91,45 +91,37 @@ __kernel void depth_conv2d_3x3( input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, - pos_in_input_block.y + in_pos_in_one_block.y - 1)), + pos_in_input_block.y + in_pos_in_one_block.y - dilation)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y - 1 >= input_height) + in_pos_in_one_block.y - dilation >= input_height) << 15)); inputs[2] = select( READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, - pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation, + pos_in_input_block.y + in_pos_in_one_block.y - dilation)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || - in_pos_in_one_block.y - 1 < 0 || - in_pos_in_one_block.x + 1 >= input_width || - in_pos_in_one_block.y - 1 >= input_height) + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) << 15)); inputs[3] = select( READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation, pos_in_input_block.y + in_pos_in_one_block.y)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x - 1 >= input_width || + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - CL_DTYPE4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ inputs[4] = select( READ_IMG_TYPE(CL_DTYPE_CHAR, @@ -147,11 +139,11 @@ __kernel void depth_conv2d_3x3( READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation, pos_in_input_block.y + in_pos_in_one_block.y)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || - in_pos_in_one_block.x + 1 >= input_width || + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); @@ -159,13 +151,13 @@ __kernel void depth_conv2d_3x3( READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, - pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation, + pos_in_input_block.y + in_pos_in_one_block.y + dilation)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || - in_pos_in_one_block.y + 1 < 0 || - in_pos_in_one_block.x - 1 >= input_width || - in_pos_in_one_block.y + 1 >= input_height) + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) << 15)); inputs[7] = select( @@ -173,24 +165,24 @@ __kernel void depth_conv2d_3x3( input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, - pos_in_input_block.y + in_pos_in_one_block.y + 1)), + pos_in_input_block.y + in_pos_in_one_block.y + dilation)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || - in_pos_in_one_block.y + 1 >= input_height) + in_pos_in_one_block.y + dilation >= input_height) << 15)); inputs[8] = select( READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, - (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, - pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation, + pos_in_input_block.y + in_pos_in_one_block.y + dilation)), (CL_DTYPE4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || - in_pos_in_one_block.y + 1 < 0 || - in_pos_in_one_block.x + 1 >= input_width || - in_pos_in_one_block.y + 1 >= input_height) + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) << 15)); CL_DTYPE4 filters[9]; @@ -221,14 +213,18 @@ __kernel void depth_conv2d_3x3( /* - if (output_pos.x == 112 && output_pos.y == 0) { + if (output_pos.x == 0 && output_pos.y == 0) { for (int i = 0; i < 9; ++i) { CL_DTYPE4 input1 = inputs[i]; float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf \n", i, in); + printf(" input4[%d]: %v4hlf \n", i, in); + } + for (int i = 0; i < 9; ++i) { + CL_DTYPE4 filters1 = filters[i]; + float4 f = (float4)(filters1.x, filters1.y, filters1.z, filters1.w); + printf(" weights4[%d]: %v4hlf \n", i, f); } - float4 out = (float4)(output.x, output.y, output.z, output.w); printf(" depth wise output output4 = %v4hlf \n", out); printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); diff --git a/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc index e36be300ba..2199d28716 100644 --- a/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_image_compute_test.cc @@ -32,6 +32,93 @@ namespace lite { // #define TEST_DEPTHWISE_CONV_IMAGE_BASIC #define TEST_DEPTHWISE_CONV_IMAGE_3X3 +template +static void conv_basic(const Dtype1* din, + Dtype2* dout, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const Dtype1* weights, + const Dtype2* bias, + int group, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int dila_w, + int dila_h, + int pad_w, + int pad_h, + bool flag_bias, + bool flag_relu) { + CHECK(!flag_relu); + auto src_data = din; + auto dst_data_ref = dout; + auto weights_data = weights; + auto with_bias = flag_bias; + auto bias_data = bias; + + int in_num = num; + int out_channels = chout; + int out_h = hout; + int out_w = wout; + + int in_channel = chin; + int in_h = hin; + int in_w = win; + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + for (int n = 0; n < in_num; ++n) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + Dtype2 bias_d = + with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0; + dst_data_ref[out_idx] = bias_d; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int widx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + + dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; + /* + if (out_idx == 0) { + VLOG(5) << "src[" << iidx << "]: " << src_data[iidx] + << "\tweights[" << widx << "]: " + << weights_data[widx] + << "\tdst[" << out_idx << "]: " + << dst_data_ref[out_idx]; + */ + } + } + } + } + } + } + } + } +} + template void depth_conv(const T* input_data, const lite::DDim& input_dims, @@ -384,11 +471,14 @@ TEST(depthwise_conv2d, compute_basic) { #ifdef TEST_DEPTHWISE_CONV_IMAGE_3X3 // #define LOOP_TEST TEST(depthwise_conv2d, compute_image2d_3x3) { + const int fc = 1; const int fw = 3; const int fh = fw; - int dilation = 1; - int stride = 1; - int pad = 0; + const int dilation = 4; + const int stride = 2; + const int pad = 2; + const bool bias_flag = false; + const bool relu_flag = false; #ifdef LOOP_TEST // for (int batch_size = 1; batch_size < 2; ++batch_size) { for (int oc = 4; oc < 10; oc += 1) { // oc = ic @@ -399,12 +489,18 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { const int ih = 112; const int iw = 112; #endif - stride = (stride == 1) ? 2 : 1; - // pad = (pad == 0) ? 1 : 0; const int fb = oc; const int ic = oc; const int oh = ConvOutputSize(ih, fh, dilation, pad, pad, stride); const int ow = ConvOutputSize(iw, fw, dilation, pad, pad, stride); + if (oh <= 0 || ow <= 0) { +#ifdef LOOP_TEST + continue; +#else + LOG(FATAL) << "Output tensor of depthwise conv is illegal!" + << "Please check your input dims and conv params"; +#endif + } LOG(INFO) << "to get kernel ..."; auto kernels = @@ -417,7 +513,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { auto kernel = std::move(kernels.front()); LOG(INFO) << "get kernel"; - lite::Tensor input, filter, output; + lite::Tensor input, filter, bias, output; operators::ConvParam param; param.x = &input; param.filter = &filter; @@ -428,6 +524,8 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { param.strides = std::vector{stride, stride}; std::vector dilations = {dilation, dilation}; param.dilations = std::make_shared>(dilations); + param.bias = bias_flag ? &bias : nullptr; + param.fuse_relu = relu_flag; std::unique_ptr context(new KernelContext); context->As().InitOnce(); @@ -442,9 +540,11 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { const DDim& input_dim = lite::DDim{std::vector({1, ic, ih, iw})}; const DDim& filter_dim = - lite::DDim{std::vector({fb, 1, 3, 3})}; + lite::DDim{std::vector({fb, fc, fh, fw})}; const DDim& output_dim = lite::DDim{std::vector({1, oc, oh, ow})}; + // element wise bias + const DDim bias_dim = DDim(std::vector{oc}); input.Resize(input_dim); filter.Resize(filter_dim); output.Resize(output_dim); @@ -460,6 +560,14 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { for (auto& f : filter_v) { f = gen(engine); } + std::vector bias_v; + if (bias_flag) { + bias.Resize(bias_dim); + bias_v.resize(bias_dim.production()); + for (auto& b : bias_v) { + b = gen(engine); + } + } LOG(INFO) << "prepare input"; CLImageConverterDefault* default_converter = @@ -496,21 +604,29 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { lite::Tensor out_ref; out_ref.Resize(output_dim); auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); - if (stride == 1) { - depth_conv(input_v.data(), - input.dims(), - filter_v.data(), - filter.dims(), - out_ref_data, - out_ref.dims()); - } else if (stride == 2) { - depth_conv(input_v.data(), - input.dims(), - filter_v.data(), - filter.dims(), - out_ref_data, - out_ref.dims()); - } + + conv_basic(input_v.data(), + out_ref_data, + 1, + oc, + oh, + ow, + ic, + ih, + iw, + filter_v.data(), + bias_v.data(), + param.groups, + fw, + fh, + stride, + stride, + dilation, + dilation, + pad, + pad, + bias_flag, + relu_flag); const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_slice_pitch{0}; @@ -538,7 +654,7 @@ TEST(depthwise_conv2d, compute_image2d_3x3) { EXPECT_FALSE(relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF); if (relative_diff > FP16_MAX_DIFF && abs_diff > FP16_ABS_DIFF) { - LOG(FATAL) << "error idx:" << i << "output_v[" << i + LOG(FATAL) << "error idx:" << i << " output_v[" << i << "]:" << output_v[i] << " " "out_ref_data[" << i << "]:" << out_ref_data[i]; -- GitLab