optimise conv 1x1 ... ,test=develop (#2388)

optimise conv 1x1 ... ,test=develop (#2388)

optimise conv 1x1 ... ,test=develop (#2388)
f077276f · xiebaiyuan · GitHub · e74609b7 · f077276f · f077276f
7 changed file
--- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -1006,8 +1006,160 @@ __kernel void conv_1x1(__private const int global_size_dim0,
  write_imageh(output_image, output_pos, output);
 }
+__kernel void conv_1x1_simple(
+    __private const int global_size_dim0, __private const int global_size_dim1,
+    __private const int global_size_dim2, __read_only image2d_t input_image,
+    __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+    __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
+#endif
+    __write_only image2d_t output_image, __private const int stride,
+    __private const int offset, __private const int input_c,__private const int input_c_origin,
+    __private const int dilation,
+    __private const int input_width,  /* of one block */
+    __private const int input_height, /* of one block */
+    __private const int output_width,
+    __private const int output_height,
+    __private const int old_w
+) {
+  half zero = 0.0f;
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  int out_w0 = out_w;
+  int out_w1 = out_w + global_size_dim1;
+  int out_w2 = out_w + global_size_dim1 * 2;
+  int out_w3 = out_w + global_size_dim1 * 3;
+  int outpos_main = mul24(out_c , old_w);
+  int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
+  int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
+  int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
+  int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
+  int2 in_pos_in_one_block0 =
+      ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
+  int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
+  int2 in_pos_in_one_block1 =
+      ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
+  int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
+  int2 in_pos_in_one_block2 =
+      ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
-__kernel void conv_1x1_spl(
+  int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
+  int2 in_pos_in_one_block3 =
+      ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
+#ifdef BIASE_CH
+  half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output1 = output0;
+    half4 output2 = output0;
+    half4 output3 = output0;
+#elif defined(BIASE_ELE)
+  half4 output0 = read_imageh(bias, sampler, output_pos0);
+    half4 output1 = output0;
+    half4 output2 = output0;
+    half4 output3 = output0;
+#else
+  half4 output0 = 0.0f;
+  half4 output1 = 0.0f;
+  half4 output2 = 0.0f;
+  half4 output3 = 0.0f;
+#endif
+  for (int i = 0; i < input_c; ++i) {
+    // ------------0---------------
+    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
+    half4 input0 = read_imageh(input_image, sampler, pos_in);
+    half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
+    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
+    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
+    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
+    output0 = mad(input0.x, weight0, output0);
+    output0 = mad(input0.y, weight1, output0);
+    output0 = mad(input0.z, weight2, output0);
+    output0 = mad(input0.w, weight3, output0);
+    // -------------1--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
+    half4 input1 = read_imageh(input_image, sampler, pos_in);
+    output1 = mad(input1.x, weight0, output1);
+    output1 = mad(input1.y, weight1, output1);
+    output1 = mad(input1.z, weight2, output1);
+    output1 = mad(input1.w, weight3, output1);
+    // -------------2--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
+    half4 input2 = read_imageh(input_image, sampler, pos_in);
+    output2 = mad(input2.x, weight0, output2);
+    output2 = mad(input2.y, weight1, output2);
+    output2 = mad(input2.z, weight2, output2);
+    output2 = mad(input2.w, weight3, output2);
+    // -------------3--------------
+    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
+    half4 input3 = read_imageh(input_image, sampler, pos_in);
+    output3 = mad(input3.x, weight0, output3);
+    output3 = mad(input3.y, weight1, output3);
+    output3 = mad(input3.z, weight2, output3);
+    output3 = mad(input3.w, weight3, output3);
+  }
+#ifdef BATCH_NORM
+  output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+    output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+    output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+    output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
+          read_imageh(new_biase, sampler, (int2)(out_c, 0));
+#endif
+#ifdef RELU
+  output0 = activation(output0);
+     output1 = activation(output1);
+     output2 = activation(output2);
+     output3 = activation(output3);
+#endif
+  if (out_w0 < old_w) {
+    write_imageh(output_image, output_pos0, output0);
+  }
+  if (out_w1 < old_w){
+    write_imageh(output_image, output_pos1, output1);
+  }
+  if (out_w2 < old_w){
+    write_imageh(output_image, output_pos2, output2);
+  }
+  if (out_w3 < old_w){
+    write_imageh(output_image, output_pos3, output3);
+  }
+}
+__kernel void conv_1x1_wrapped(
    __private const int global_size_dim0, __private const int global_size_dim1,
    __private const int global_size_dim2, __read_only image2d_t input_image,
    __read_only image2d_t filter,
@@ -1026,7 +1178,7 @@ __kernel void conv_1x1_spl(
    __private const int output_height,
    __private const int old_w
    ) {
+  half zero = 0.0f;
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
@@ -1065,14 +1217,14 @@ __kernel void conv_1x1_spl(
 #ifdef BIASE_CH
    half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
-    half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output1 = output0;
-    half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output2 = output0;
-    half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
+    half4 output3 = output0;
 #elif defined(BIASE_ELE)
    half4 output0 = read_imageh(bias, sampler, output_pos0);
-    half4 output1 = read_imageh(bias, sampler, output_pos1);
+    half4 output1 = output0;
-    half4 output2 = read_imageh(bias, sampler, output_pos2);
+    half4 output2 = output0;
-    half4 output3 = read_imageh(bias, sampler, output_pos3);
+    half4 output3 = output0;
 #else
  half4 output0 = 0.0f;
@@ -1083,6 +1235,10 @@ __kernel void conv_1x1_spl(
  int max_w_bound = input_c * input_width;
  int burndary_index = input_c * 4 - input_c_origin;
+  bool burndary_index_w = burndary_index==1||burndary_index==2||burndary_index==3;
+  bool burndary_index_z = burndary_index==2||burndary_index==3;
+  bool burndary_index_y = burndary_index==3;
  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
@@ -1092,22 +1248,13 @@ __kernel void conv_1x1_spl(
    half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
    half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
    half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
    int bound_gap = max_w_bound - pos_in.x - 1;
-    if (bound_gap < input_width && bound_gap >= 0){
-      if (burndary_index==0){
+    bool outof_bound =  bound_gap < input_width && bound_gap >= 0;
-        // do nothing
+    input0.w = select(input0.w,zero,outof_bound && burndary_index_w);
-      } else if (burndary_index==1){
+    input0.z = select(input0.z,zero,outof_bound && burndary_index_z);
-        input0.w = 0.0f;
+    input0.y = select(input0.y,zero,outof_bound && burndary_index_y);
-      } else if (burndary_index==2){
-        input0.z = 0.0f;
-        input0.w = 0.0f;
-      } else if (burndary_index==3){
-        input0.y = 0.0f;
-        input0.z = 0.0f;
-        input0.w = 0.0f;
-      }
-    }
    output0 = mad(input0.x, weight0, output0);
    output0 = mad(input0.y, weight1, output0);
    output0 = mad(input0.z, weight2, output0);
@@ -1117,20 +1264,12 @@ __kernel void conv_1x1_spl(
    half4 input1 = read_imageh(input_image, sampler, pos_in);
    bound_gap = max_w_bound - pos_in.x - 1;
-    if (bound_gap < input_width && bound_gap >= 0){
-      if (burndary_index==0){
+    outof_bound =  bound_gap < input_width && bound_gap >= 0;
-        // do nothing
+    input1.w = select(input1.w,zero,outof_bound && burndary_index_w);
-      } else if (burndary_index==1){
+    input1.z = select(input1.z,zero,outof_bound && burndary_index_z);
-        input1.w = 0.0f;
+    input1.y = select(input1.y,zero,outof_bound && burndary_index_y);
-      } else if (burndary_index==2){
-        input1.z = 0.0f;
-        input1.w = 0.0f;
-      } else if (burndary_index==3){
-        input1.y = 0.0f;
-        input1.z = 0.0f;
-        input1.w = 0.0f;
-      }
-    }
    output1 = mad(input1.x, weight0, output1);
    output1 = mad(input1.y, weight1, output1);
    output1 = mad(input1.z, weight2, output1);
@@ -1141,20 +1280,12 @@ __kernel void conv_1x1_spl(
    half4 input2 = read_imageh(input_image, sampler, pos_in);
    bound_gap = max_w_bound - pos_in.x - 1;
-    if (bound_gap < input_width && bound_gap >= 0){
-      if (burndary_index==0){
+    outof_bound =  bound_gap < input_width && bound_gap >= 0;
-        // do nothing
+    input2.w = select(input2.w,zero,outof_bound && burndary_index_w);
-      } else if (burndary_index==1){
+    input2.z = select(input2.z,zero,outof_bound && burndary_index_z);
-        input2.w = 0.0f;
+    input2.y = select(input2.y,zero,outof_bound && burndary_index_y);
-      } else if (burndary_index==2){
-        input2.z = 0.0f;
-        input2.w = 0.0f;
-      } else if (burndary_index==3){
-        input2.y = 0.0f;
-        input2.z = 0.0f;
-        input2.w = 0.0f;
-      }
-    }
    output2 = mad(input2.x, weight0, output2);
    output2 = mad(input2.y, weight1, output2);
    output2 = mad(input2.z, weight2, output2);
@@ -1164,20 +1295,11 @@ __kernel void conv_1x1_spl(
    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
    half4 input3 = read_imageh(input_image, sampler, pos_in);
    bound_gap = max_w_bound - pos_in.x - 1;
-    if (bound_gap < input_width && bound_gap >= 0){
-      if (burndary_index==0){
+    outof_bound =  bound_gap < input_width && bound_gap >= 0;
-        // do nothing
+    input3.w = select(input3.w,zero,outof_bound && (burndary_index==1||burndary_index==2||burndary_index==3));
-      } else if (burndary_index==1){
+    input3.z = select(input3.z,zero,outof_bound && (burndary_index==2||burndary_index==3));
-        input3.w = 0.0f;
+    input3.y = select(input3.y,zero,outof_bound && burndary_index==3);
-      } else if (burndary_index==2){
-        input3.z = 0.0f;
-        input3.w = 0.0f;
-      } else if (burndary_index==3){
-        input3.y = 0.0f;
-        input3.z = 0.0f;
-        input3.w = 0.0f;
-      }
-    }
    output3 = mad(input3.x, weight0, output3);
    output3 = mad(input3.y, weight1, output3);

--- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -152,8 +152,13 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
+    if (param->Input()->dims()[1] % 4 == 0) {
+      this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file,
+                                 build_options);
+    } else {
+      this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file,
+                                 build_options);
+    }
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {

--- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -48,8 +48,13 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
    param->ExecMode() = ConvParam<GPU_CL>::EXEC_SLIDINGWINDOW1x1_FLOAT;
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
+    if (param->Input()->dims()[1] % 4 == 0) {
-    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
+      this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file,
+                                 build_options);
+    } else {
+      this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file,
+                                 build_options);
+    }
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&

--- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -50,8 +50,13 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
+    if (param->Input()->dims()[1] % 4 == 0) {
+      this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file,
+                                 build_options);
+    } else {
+      this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file,
+                                 build_options);
+    }
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {

--- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
@@ -107,8 +107,13 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
+    if (param->Input()->dims()[1] % 4 == 0) {
+      this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file,
+                                 build_options);
+    } else {
+      this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file,
+                                 build_options);
+    }
  } else if (param->Filter()->dims()[1] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {

--- a/mobile/src/operators/kernel/cl/conv_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp
@@ -45,7 +45,11 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file);
+    if (param->Input()->dims()[1] % 4 == 0) {
+      this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file);
+    } else {
+      this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file);
+    }
    DLOG << "conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&

--- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
+++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
@@ -46,7 +46,13 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
-    this->cl_helper_.AddKernel("conv_1x1_spl", conv_kernel_file, build_options);
+    if (param->Input()->dims()[1] % 4 == 0) {
+      this->cl_helper_.AddKernel("conv_1x1_simple", conv_kernel_file,
+                                 build_options);
+    } else {
+      this->cl_helper_.AddKernel("conv_1x1_wrapped", conv_kernel_file,
+                                 build_options);
+    }
    DLOG << "conv 1x1";
  } else if (param->Filter()->dims()[1] == 1 &&