Commit 5d9b0471 authored by StarryRain, committed by Yanzhan Yang

Optimize OpenCL conv_3x3 performance (#1994)

* add opencl  depthwise_conv_trans_op

* test=develop

* Optimize OpenCL conv_3x3S2 performance

* test=develop
Parent f38a741a
@@ -424,8 +424,8 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     write_imageh(output_image, output_pos, output);
 }
-// dilation == 1 && stride == 1 && ou_nh == ou_h
-__kernel void conv_3x3s1(__private const int item_ch,
+// dilation == 1
+__kernel void conv_3x3spl(__private const int item_ch,
                          __private const int item_w,
                          __private const int item_h,
                          __read_only image2d_t input_image,
@@ -456,14 +456,8 @@ __read_only image2d_t new_scale,
   const int item_w_id = get_global_id(1);
   const int item_h_id = get_global_id(2);
-  // in_width_id_per_blk
-  int in_w_id0 = item_w_id - pad;
-  int in_w_id1 = in_w_id0 + item_w;
-  int in_w_id2 = in_w_id1 + item_w;
-  int in_w_id3 = in_w_id2 + item_w;
-  int in_w_id4 = in_w_id3 + item_w;
-  // out_width_id_per_blk
+  // out_width_id_per_blk and out_batch_id
+  int out_batch_id = item_h_id / in_h;
   int out_w_base_id = item_ch_id * out_w;
   int out_w_id0 = item_w_id;
   int out_w_id1 = out_w_id0 + item_w;
@@ -471,6 +465,14 @@ __read_only image2d_t new_scale,
   int out_w_id3 = out_w_id2 + item_w;
   int out_w_id4 = out_w_id3 + item_w;
+  // in_width_id_per_blk and in_height_id_per_batch
+  int in_h_id = (item_h_id % out_h) * stride - pad;
+  int in_w_id0 = item_w_id * stride - pad;
+  int in_w_id1 = in_w_id0 + item_w * stride;
+  int in_w_id2 = in_w_id1 + item_w * stride;
+  int in_w_id3 = in_w_id2 + item_w * stride;
+  int in_w_id4 = in_w_id3 + item_w * stride;
 #ifdef BIASE_CH
   half4 output[5];
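Reviewer note: the hunk above swaps the stride-1 width indices for stride-aware ones and adds a per-batch height index. Below is a minimal, self-contained C++ sketch of that index arithmetic (the struct and function names are illustrative, not part of this patch); it only mirrors the formulas in the added lines, where each work item covers five output columns spaced item_w apart.

```cpp
// Illustrative sketch only (not project code): mirrors the index arithmetic
// added above for the conv_3x3spl kernel. Each work item handles 5 output
// pixels along the width, spaced item_w apart in the output.
#include <array>
#include <cstdio>

struct BlockCoords {
  int out_batch_id;            // which image of the batch (as computed above)
  int in_h_id;                 // top input row of the 3x3 window, may be < 0
  std::array<int, 5> in_w_id;  // leftmost input column of each of the 5 outputs
};

BlockCoords MapOutputToInput(int item_w_id, int item_h_id, int item_w,
                             int stride, int pad, int out_h, int in_h) {
  BlockCoords c{};
  c.out_batch_id = item_h_id / in_h;               // as in the added line above
  c.in_h_id = (item_h_id % out_h) * stride - pad;  // row within this image
  c.in_w_id[0] = item_w_id * stride - pad;
  for (int i = 1; i < 5; ++i) {
    // consecutive outputs of one work item are item_w columns apart in the
    // output, so their input windows are item_w * stride columns apart
    c.in_w_id[i] = c.in_w_id[i - 1] + item_w * stride;
  }
  return c;
}

int main() {
  // Hypothetical sizes: stride 2, pad 1, 16 input rows, 8 output rows per image.
  BlockCoords c = MapOutputToInput(/*item_w_id=*/3, /*item_h_id=*/5,
                                   /*item_w=*/4, /*stride=*/2, /*pad=*/1,
                                   /*out_h=*/8, /*in_h=*/16);
  std::printf("batch %d  in_h_id %d  in_w_id0 %d\n", c.out_batch_id, c.in_h_id,
              c.in_w_id[0]);
  return 0;
}
```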
@@ -518,8 +520,8 @@ __read_only image2d_t new_scale,
   for (int h = 0; h < 3; h++) {
-    int in_h_val = select(item_h_id + h - pad, -1,
-                          (item_h_id + h - pad < 0 || item_h_id + h - pad >= in_h));
+    int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
+                          (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
     for (int w = 0; w < 3; w++) {
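Reviewer note: the select() change above folds the batch offset into the row coordinate. For readers unfamiliar with OpenCL's scalar select(a, b, c), which returns b when c is non-zero and a otherwise, here is a small illustrative C++ sketch of the clamping idea; it assumes a clamp-to-border sampler (not shown in this excerpt), so a coordinate of -1 reads border zeros.

```cpp
// Illustrative sketch only: scalar behaviour of OpenCL's select(a, b, c),
// which returns b when c is non-zero and a otherwise. The kernel uses it to
// force out-of-range rows to -1 so the image read lands on the sampler's
// border (zero for a clamp-to-border sampler, assumed here).
#include <iostream>

int select_scalar(int a, int b, int c) { return c ? b : a; }

int main() {
  const int in_h = 8;  // hypothetical input height
  for (int row = -1; row <= in_h; ++row) {
    int in_h_val = select_scalar(row, -1, row < 0 || row >= in_h);
    std::cout << "requested row " << row << " -> sampled row " << in_h_val
              << '\n';
  }
  return 0;
}
```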
@@ -539,7 +541,6 @@ __read_only image2d_t new_scale,
   filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
   filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
   filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3
   filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3
   filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3
......
@@ -82,17 +82,11 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
     //      winograd_transform_weight<4, 3>(&this->cl_helper_, param->Filter());
     //
     //    } else {
-    if (param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
-      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
-      param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                   cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
-    } else {
-      param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
-      param->Filter()->InitCLImage(cl_helper_.CLContext(),
-                                   cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3", conv_kernel_file, build_options);
-    }
+    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file, build_options);
     //    }
   } else if (param->Filter()->dims()[2] == 7 &&
@@ -123,7 +117,6 @@ void ConvAddKernel<GPU_CL, float>::Compute(
       WinogradConv3x3<4, 3>(&this->cl_helper_, param, false, param.Bias());
       break;
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT:
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW5x5_FLOAT:
     case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW7x7_FLOAT:
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3_FLOAT:
@@ -132,7 +125,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
     case ConvParam<GPU_CL>::EXEC_DEPTHWISE3x3S1_FLOAT:
      DWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
-    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT:
+    case ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT:
       SWConvAddBnRelu(&this->cl_helper_, param, false, param.Bias());
       break;
     default:
......
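Reviewer note: with the kernel generalized, ConvAddKernel no longer needs a stride-1 special case. The sketch below is a self-contained paraphrase with simplified, hypothetical types (not the project's real classes) of the resulting dispatch: every non-depthwise 3x3 convolution now maps to EXEC_SLIDINGWINDOW3x3_FLOAT and the conv_3x3spl kernel regardless of stride.

```cpp
// Self-contained paraphrase (simplified, hypothetical types) of the 3x3
// dispatch after this commit: the stride-specific branch is gone and every
// non-depthwise 3x3 convolution uses the single conv_3x3spl kernel.
#include <iostream>
#include <string>

enum class ExecMode { SlidingWindow1x1, SlidingWindow3x3, SlidingWindow5x5 };

struct Dispatch {
  ExecMode mode;
  std::string kernel_name;
};

Dispatch Select3x3(int stride, int dilation) {
  // Before: stride == 1 && dilation == 1 picked "conv_3x3s1", everything else
  // fell back to "conv_3x3". Both cases now collapse into one path.
  (void)stride;
  (void)dilation;
  return {ExecMode::SlidingWindow3x3, "conv_3x3spl"};
}

int main() {
  const int strides[] = {1, 2};
  for (int stride : strides) {
    Dispatch d = Select3x3(stride, /*dilation=*/1);
    std::cout << "stride " << stride << " -> " << d.kernel_name << '\n';
  }
  return 0;
}
```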
@@ -86,7 +86,8 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
       param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
       param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                    cl_helper_.CLCommandQueue());
-      this->cl_helper_.AddKernel("conv_3x3s1", conv_kernel_file, build_options);
+      this->cl_helper_.AddKernel("conv_3x3spl", conv_kernel_file,
+                                 build_options);
     } else {
       param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW3x3_FLOAT;
       param->Filter()->InitCLImage(cl_helper_.CLContext(),
......