add opencl slidingwindow3*3s1 (#1799)

* add CPU_ARCH info, improve the performance of GEMM1*1s1 * improve the performance of gemm1*1s1_conv_add and gemm1*1s1_conv_add_bn_relu * improve the performance of slidingwindow_bn_relu,slidingwindow_add,slidingwindow_add_bn_relu,gemm1*1s1_bn_relu,gemm1*1s1_add_relu * add faster sgemv_notrans_mx1, fix test_fusion_op * add opencl slidingwindow3*3s1

add opencl slidingwindow3*3s1 (#1799)
* add CPU_ARCH info, improve the performance of GEMM1*1s1 * improve the performance of gemm1*1s1_conv_add and gemm1*1s1_conv_add_bn_relu * improve the performance of slidingwindow_bn_relu,slidingwindow_add,slidingwindow_add_bn_relu,gemm1*1s1_bn_relu,gemm1*1s1_add_relu * add faster sgemv_notrans_mx1, fix test_fusion_op * add opencl slidingwindow3*3s1
6089d6b4 · StarryRain · Jiaying Zhao · 41a8af2b · 6089d6b4 · 6089d6b4
5 changed file
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
@@ -19,6 +19,7 @@ limitations under the License. */

 namespace paddle_mobile {
 namespace operators {
+bool use_lws = true;

 template <>
 void winograd_transform_weight<4, 3>(framework::CLHelper *cl_helper,
@@ -144,9 +145,18 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
        static_cast<const uint32_t>(maped_w),
        static_cast<const uint32_t>(default_work_size.data()[2])};

-    status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel,
-                                    default_work_size.size(), NULL, work_size,
-                                    NULL, 0, NULL, NULL);
+    if (work_size[1] % 60 == 0 && use_lws) {
+      const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
+                                         static_cast<const uint32_t>(60),
+                                         static_cast<const uint32_t>(1)};
+      status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel,
+                                      default_work_size.size(), NULL, work_size,
+                                      local_work_size, 0, NULL, NULL);
+    } else {
+      status = clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel,
+                                      default_work_size.size(), NULL, work_size,
+                                      NULL, 0, NULL, NULL);
+    }
    CL_CHECK_ERRORS(status);
  } else {
    status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
@@ -335,11 +345,128 @@ void DWConvAddBnRelu(framework::CLHelper *cl_helper,
  status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

-  status = clEnqueueNDRangeKernel(
-      cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
-      default_work_size.data(), NULL, 0, NULL, NULL);
+  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
+    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
+                                       static_cast<const uint32_t>(60),
+                                       static_cast<const uint32_t>(1)};
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), local_work_size, 0, NULL, NULL);
+  } else {
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), NULL, 0, NULL, NULL);
+  }
+
  CL_CHECK_ERRORS(status);
 }

+void SWConvAddBnRelu(framework::CLHelper *cl_helper,
+                     const ConvParam<GPU_CL> &param, bool ifRelu,
+                     const framework::CLImage *biase,
+                     const framework::CLImage *new_scale,
+                     const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+
+  int w_blk_size = 5;
+  int w_blk = (w + w_blk_size - 1) / w_blk_size;
+  default_work_size[1] = w_blk;
+
+  int h_blk_size = 1;
+  int h_blk = (nh + h_blk_size - 1) / h_blk_size;
+  default_work_size[2] = h_blk;
+
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+
+  auto output = param.Output()->GetCLImage();
+  int stride = param.Strides()[0];
+  int pad = param.Paddings()[0];
+  int dilation = param.Dilations()[0];
+
+  int input_channel = param.Input()->dims()[1];
+  int input_height = param.Input()->dims()[2];
+  int input_width = param.Input()->dims()[3];
+
+  int output_height = param.Output()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+
+  cl_int status;
+  int index = 0;
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  if (biase) {
+    auto bias_mem = biase->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+
+  if (new_scale && new_bias) {
+    auto new_scale_mem = new_scale->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+    CL_CHECK_ERRORS(status);
+
+    auto new_bias_mem = new_bias->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &pad);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+
+  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
+    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
+                                       static_cast<const uint32_t>(60),
+                                       static_cast<const uint32_t>(1)};
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), local_work_size, 0, NULL, NULL);
+  } else {
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), NULL, 0, NULL, NULL);
+  }
+  CL_CHECK_ERRORS(status);
+}
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/cl/cl-kernel-func/conv_func.h
+++ b/src/operators/kernel/cl/cl-kernel-func/conv_func.h
@@ -47,6 +47,12 @@ void DWConvAddBnRelu(framework::CLHelper *cl_helper,
                     const framework::CLImage *new_scale = nullptr,
                     const framework::CLImage *new_bias = nullptr);

+void SWConvAddBnRelu(framework::CLHelper *cl_helper,
+                     const ConvParam<GPU_CL> &param, bool ifRelu = false,
+                     const framework::CLImage *biase = nullptr,
+                     const framework::CLImage *new_scale = nullptr,
+                     const framework::CLImage *new_bias = nullptr);
+
 }  // namespace operators
 }  // namespace paddle_mobile


--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -424,6 +424,202 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    write_imageh(output_image, output_pos, output);
 }

+   // dilation == 1 && stride == 1 && ou_nh == ou_h
+__kernel void conv_3x3s1(__private const int item_ch,
+                               __private const int item_w,
+                               __private const int item_h,
+                               __read_only image2d_t input_image,
+                               __read_only image2d_t filter_image,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+        __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+__read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif
+                               __write_only image2d_t output_image,
+                               __private const int stride,
+                               __private const int pad,
+                               __private const int dilation,
+                               __private const int in_ch,
+                               __private const int in_w,
+                               __private const int in_h,
+                               __private const int out_w,
+                               __private const int out_h) {
+
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP          |
+                              CLK_FILTER_NEAREST;
+
+    // item_id
+    const int item_ch_id = get_global_id(0);
+    const int item_w_id = get_global_id(1);
+    const int item_h_id = get_global_id(2);
+
+    // in_width_id_per_blk
+    int in_w_id0 = item_w_id - pad;
+    int in_w_id1 = in_w_id0 + item_w;
+    int in_w_id2 = in_w_id1 + item_w;
+    int in_w_id3 = in_w_id2 + item_w;
+    int in_w_id4 = in_w_id3 + item_w;
+
+    // out_width_id_per_blk
+    int out_w_base_id = item_ch_id * out_w;
+    int out_w_id0 = item_w_id;
+    int out_w_id1 = out_w_id0 + item_w;
+    int out_w_id2 = out_w_id1 + item_w;
+    int out_w_id3 = out_w_id2 + item_w;
+    int out_w_id4 = out_w_id3 + item_w;
+
+#ifdef BIASE_CH
+
+    half4 output[5];
+    output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
+    output[1] = output[0];
+    output[2] = output[0];
+    output[3] = output[0];
+    output[4] = output[0];
+
+#elif defined(BIASE_ELE)
+
+    half4 output[5];
+    output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
+    if (out_w_id1 < out_w) {
+        output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
+    }
+    if (out_w_id2 < out_w) {
+        output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
+    }
+    if (out_w_id3 < out_w) {
+        output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
+    }
+    if (out_w_id4 < out_w) {
+        output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
+    }
+#else
+    half4 output[5] = {0.0f};
+#endif
+
+    half4 filter[4] = {0.0f};
+    half4 filter_trans[4] = {0.0f};
+    half4 input[5] = {0.0f};
+
+    int filter_h_val0 = item_ch_id * 4 * 3;
+    int filter_h_val1 = filter_h_val0 + 3;
+    int filter_h_val2 = filter_h_val1 + 3;
+    int filter_h_val3 = filter_h_val2 + 3;
+
+    for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
+        int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
+
+        const int in_w_base_id = mul24(ch, in_w);
+
+        int filter_w_val = ch * 3;
+
+        for (int h = 0; h < 3; h++) {
+
+            int in_h_val = select(item_h_id + h - pad, -1,
+                                   (item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h));
+
+            for (int w = 0; w < 3; w++) {
+
+                int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
+                                  (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
+                int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
+                                   (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
+                int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
+                                   (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
+                int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
+                                   (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
+                int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
+                                   (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
+
+                filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
+                filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
+                filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
+                filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
+
+
+                filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x);    // in_ch:0,out_ch:0-3
+                filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y);    // in_ch:1,out_ch:0-3
+                filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z);    // in_ch:2,out_ch:0-3
+                filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w);    // in_ch:3,out_ch:0-3
+
+                input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
+                input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
+                input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
+                input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
+                input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
+
+                output[0] = mad(input[0].x, filter_trans[0], output[0]);
+                output[1] = mad(input[1].x, filter_trans[0], output[1]);
+                output[2] = mad(input[2].x, filter_trans[0], output[2]);
+                output[3] = mad(input[3].x, filter_trans[0], output[3]);
+                output[4] = mad(input[4].x, filter_trans[0], output[4]);
+
+                if (ch_surplus < 3) {
+                    output[0] = mad(input[0].y, filter_trans[1], output[0]);
+                    output[1] = mad(input[1].y, filter_trans[1], output[1]);
+                    output[2] = mad(input[2].y, filter_trans[1], output[2]);
+                    output[3] = mad(input[3].y, filter_trans[1], output[3]);
+                    output[4] = mad(input[4].y, filter_trans[1], output[4]);
+                }
+                if (ch_surplus < 2) {
+                    output[0] = mad(input[0].z, filter_trans[2], output[0]);
+                    output[1] = mad(input[1].z, filter_trans[2], output[1]);
+                    output[2] = mad(input[2].z, filter_trans[2], output[2]);
+                    output[3] = mad(input[3].z, filter_trans[2], output[3]);
+                    output[4] = mad(input[4].z, filter_trans[2], output[4]);
+                }
+                if (ch_surplus < 1) {
+                    output[0] = mad(input[0].w, filter_trans[3], output[0]);
+                    output[1] = mad(input[1].w, filter_trans[3], output[1]);
+                    output[2] = mad(input[2].w, filter_trans[3], output[2]);
+                    output[3] = mad(input[3].w, filter_trans[3], output[3]);
+                    output[4] = mad(input[4].w, filter_trans[3], output[4]);
+                }
+            }
+        }
+    }
+#ifdef BATCH_NORM
+    half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
+    half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
+    output[0] = mad(scale, output[0], biase);
+    if (out_w_id1 < out_w) {
+        output[1] =  mad(scale, output[1], biase);
+    }
+    if (out_w_id2 < out_w) {
+        output[2] =  mad(scale, output[2], biase);
+    }
+    if (out_w_id3 < out_w) {
+        output[3] =  mad(scale, output[3], biase);
+    }
+    if (out_w_id4 < out_w) {
+        output[4] =  mad(scale, output[4], biase);
+    }
+#endif
+
+#ifdef RELU
+    output[0] = activation(output[0]);
+    output[1] = activation(output[1]);
+    output[2] = activation(output[2]);
+    output[3] = activation(output[3]);
+    output[4] = activation(output[4]);
+#endif
+    write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
+    if (out_w_id1 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
+    }
+    if (out_w_id2 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
+    }
+    if (out_w_id3 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
+    }
+    if (out_w_id4 < out_w) {
+        write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
+    }
+}




--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -82,11 +82,17 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
    //      winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
    //
    //    } else {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
-    param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
+      param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                   cl_helper_.CLCommandQueue());
+      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
+      param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                   cl_helper_.CLCommandQueue());
+      this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    }
    //    }

  } else if (param->Filter()->dims()[2] == 7 &&
@@ -126,6 +132,9 @@ void ConvAddKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
      DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+      SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());

--- a/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -82,11 +82,17 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
    //      winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
    //
    //    } else {
-    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
-    param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                 cl_helper_.CLCommandQueue());
-
-    this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
+      param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                   cl_helper_.CLCommandQueue());
+      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
+    } else {
+      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
+      param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                   cl_helper_.CLCommandQueue());
+      this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
+    }
    //    }
    DLOG << "conv 3x3";

@@ -112,6 +118,9 @@ void ConvReluKernel<GPU_CL, float>::Compute(
    case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
      DWConvAddBnRelu(&this->cl_helper_, param, true);
      break;
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+      SWConvAddBnRelu(&this->cl_helper_, param, true);
+      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                    param.ExecMode());