optimize conv7*7, compute 2 point in 1 thread. test=develop (#2028)

9851c008 · NazgulLee · Yanzhan Yang · 8ab543ff · 9851c008 · 9851c008
4 changed file
--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -32,6 +32,135 @@ void WinogradConv3x3<4, 3>(framework::CLHelper *cl_helper,
                           const framework::CLImage *new_scale,
                           const framework::CLImage *new_bias) {}
+void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper,
+                        const ConvParam<GPU_CL> &param, bool ifRelu,
+                        const framework::CLImage *biase,
+                        const framework::CLImage *new_scale,
+                        const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
+  default_work_size[1] = (default_work_size[1] + 1) / 2;
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int offset = param.Offset();
+  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
+                    param.Input()->Converter())
+                    ->GetCBlock();
+  int dilation = param.Dilations()[0];
+  int input_width = param.Input()->dims()[3];
+  int input_height = param.Input()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  int filter_channel = param.Filter()->dims()[1];
+  int input_channel = param.Input()->dims()[1];
+  //
+  //    DLOG << " c block " << c_block;
+  //    DLOG << " w " << w;
+  //    DLOG << " nh " << nh;
+  //    DLOG << " stride " << stride;
+  //    DLOG << " offset " << offset;
+  //    DLOG << " input_c " << input_c;
+  //    DLOG << " dilation " << dilation;
+  //    DLOG << " input width " << input_width;
+  //    DLOG << " input height " << input_height;
+  //    DLOG << " output width " << output_width;
+  //    DLOG << " output height " << output_height;
+  //    DLOG << " input dim " << param.Input()->dims();
+  //    DLOG << " output dim " << param.Output()->dims();
+  //    DLOG << " filter dim " << param.Filter()->dims();
+  cl_int status;
+  int index = 0;
+  status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+  if (biase) {
+    auto bias_mem = biase->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+  if (new_scale && new_bias) {
+    auto new_scale_mem = new_scale->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+    CL_CHECK_ERRORS(status);
+    auto new_bias_mem = new_bias->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+  if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) {
+    if (filter_channel != input_channel) {
+      if (filter_channel != 1) {
+        status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
+        CL_CHECK_ERRORS(status);
+        int has_group = 1;
+        status = clSetKernelArg(kernel, index++, sizeof(int), &has_group);
+        CL_CHECK_ERRORS(status);
+      }
+    } else {
+      status = clSetKernelArg(kernel, index++, sizeof(int), &filter_channel);
+      CL_CHECK_ERRORS(status);
+      int has_group = 0;
+      status = clSetKernelArg(kernel, index++, sizeof(int), &has_group);
+      CL_CHECK_ERRORS(status);
+    }
+  }
+  //  DLOG<<"default_work_size"<<default_work_size[0]<<"
+  //  "<<default_work_size[1]<<" "<<default_work_size[2];
+  status = clEnqueueNDRangeKernel(
+      cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
 void ConvAddBnRelu(framework::CLHelper *cl_helper,
                   const ConvParam<GPU_CL> &param, bool ifRelu,
                   const framework::CLImage *biase,

--- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h
@@ -41,6 +41,12 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
                   const framework::CLImage *new_scale = nullptr,
                   const framework::CLImage *new_bias = nullptr);
+void ConvAddBnReluPt1x2(framework::CLHelper *cl_helper,
+                        const ConvParam<GPU_CL> &param, bool ifRelu = false,
+                        const framework::CLImage *biase = nullptr,
+                        const framework::CLImage *new_scale = nullptr,
+                        const framework::CLImage *new_bias = nullptr);
 void DWConvAddBnRelu(framework::CLHelper *cl_helper,
                     const ConvParam<GPU_CL> &param, bool ifRelu = false,
                     const framework::CLImage *biase = nullptr,

--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -2111,6 +2111,157 @@ __kernel void conv_7x7(__private const int global_size_dim0,
    write_imageh(output_image, output_pos, output);
 }
+__kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input_image,
+                                              __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                                              __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height,/* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+    const int out_c = get_global_id(0);
+    const int out_w1 = get_global_id(1);
+    const int out_nh = get_global_id(2);
+    if (out_c >= global_size_dim0 ||
+        out_w1 >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        return;
+    }
+    const int out_w = out_w1 * 2;
+    int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
+    const int filter_n0 = 4 * out_c + 0;
+    const int filter_n1 = 4 * out_c + 1;
+    const int filter_n2 = 4 * out_c + 2;
+    const int filter_n3 = 4 * out_c + 3;
+    int2 stride_xy;
+    stride_xy.x = stride;
+    stride_xy.y = stride;
+    int2 ouput_pos_in_one_block;
+    ouput_pos_in_one_block.x = out_w;
+    ouput_pos_in_one_block.y = out_nh;
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+    half4 output0 = 0.0f;
+    half4 output1 = 0.0f;
+#ifdef BIASE_CH
+    output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    output1 = output0;
+#elif defined(BIASE_ELE)
+    output0 = read_imageh(bias, sampler, output_pos);
+    output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
+#else
+    output0 = 0.0f;
+    output1 = 0.0f;
+#endif
+   half4 input[8];
+   half4 filter0[4];
+   half4 filter1[4];
+   half4 filter2[4];
+   half4 filter3[4];
+   int2 filter_pos0;
+   int2 filter_pos1;
+   int2 filter_pos2;
+   int2 filter_pos3;
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        for(int k = 0; k < 7; k++){
+         for (int j = 0; j < 8; j++) {
+             input[j]  =  select(read_imageh(input_image, sampler,
+                                                                  (int2)(pos_in.x + (j - 3) * dilation, pos_in.y +  (k - 3) * dilation)),
+                                                                  (half4)(0.0f),
+                                                                  (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
+             int filter_h = k;
+             int filter_w = j;
+             int filter_c = i;
+             if (j < 7) {
+                filter_pos0.x = filter_c * 7 + filter_w;
+                filter_pos0.y = filter_n0 * 7 + filter_h;
+                filter_pos1.x = filter_c * 7 + filter_w;
+                filter_pos1.y = filter_n1 * 7 + filter_h;
+                filter_pos2.x = filter_c * 7 + filter_w;
+                filter_pos2.y = filter_n2 * 7 + filter_h;
+                filter_pos3.x = filter_c * 7 + filter_w;
+                filter_pos3.y = filter_n3 * 7 + filter_h;
+                filter0[0] =  read_imageh(filter_image, sampler, filter_pos0);
+                filter0[1] =  read_imageh(filter_image, sampler, filter_pos1);
+                filter0[2] =  read_imageh(filter_image, sampler, filter_pos2);
+                filter0[3] =  read_imageh(filter_image, sampler, filter_pos3);
+                output0.x += dot(input[j], filter0[0]);
+                output0.y += dot(input[j], filter0[1]);
+                output0.z += dot(input[j], filter0[2]);
+                output0.w += dot(input[j], filter0[3]);
+             }
+             if (j > 0) {
+               output1.x += dot(input[j], filter1[0]);
+               output1.y += dot(input[j], filter1[1]);
+               output1.z += dot(input[j], filter1[2]);
+               output1.w += dot(input[j], filter1[3]);
+             }
+             filter1[0] = filter0[0];
+             filter1[1] = filter0[1];
+             filter1[2] = filter0[2];
+             filter1[3] = filter0[3];
+         }
+        }
+   }
+#ifdef BATCH_NORM
+    half s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
+    half b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
+    output0 = output0 * s + b;
+    output1 = output1 * s + b;
+#endif
+#ifdef RELU
+    output0 = activation(output0);
+    output1 = activation(output1);
+#endif
+    write_imageh(output_image, output_pos, output0);
+    if ((output_pos.x + 1) % output_width != 0) {
+      write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
+    }
+}
 __kernel void conv_5x5(__private const int global_size_dim0,
                                              __private const int global_size_dim1,
                                              __private const int global_size_dim2,

--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -95,7 +95,8 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
    param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_7x7", conv_kernel_file, build_options);
+    this->cl_helper_.AddKernel("conv_7x7Pt1x2", conv_kernel_file,
+                               build_options);
  } else if (param->Filter()->dims()[2] == 5 &&
             param->Filter()->dims()[3] == 5) {
@@ -118,10 +119,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
      break;
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
      ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
+      ConvAddBnReluPt1x2(&this->cl_helper_, param, false, param.Bias());
+      break;
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
      DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
      break;