[LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite (#2998)

* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite
  # Conflicts:
  #   lite/kernels/opencl/conv_image_compute_test.cc
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,test=develop
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,test=develop
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,test=develop
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,rm1x1 old,test=develop

[LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite (#2998)
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite
  # Conflicts:
  #   lite/kernels/opencl/conv_image_compute_test.cc
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,test=develop
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,test=develop
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,test=develop
* [LITE][OPENCL][Image] conv 1x1 5x5 7x7 suite,rm1x1 old,test=develop
b4c5fdb8 · xiebaiyuan · GitHub · 2c229275 · b4c5fdb8 · b4c5fdb8
3 changed files
--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl
@@ -15,7 +15,7 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
                         __write_only image2d_t output_image,
                         __private const int stride,
                         __private const int offset,
-                         __private const int input_c,
+                         __private const int input_c_block,
                         __private const int input_c_origin,
                         __private const int dilation,
                         __private const int input_width,  /* of one block */
@@ -79,14 +79,14 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
  CL_DTYPE4 output3 = 0.0f;
 #endif
-  int max_w_bound = input_c * input_width;
+  int max_w_bound = input_c_block * input_width;
-  int burndary_index = input_c * 4 - input_c_origin;
+  int burndary_index = input_c_block * 4 - input_c_origin;
  bool burndary_index_w =
      burndary_index == 1 || burndary_index == 2 || burndary_index == 3;
  bool burndary_index_z = burndary_index == 2 || burndary_index == 3;
  bool burndary_index_y = burndary_index == 3;
-  for (int i = 0; i < input_c; ++i) {
+  for (int i = 0; i < input_c_block; ++i) {
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                         in_pos_in_one_block0.y);
@@ -107,11 +107,81 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
    input0.w = select(input0.w, zero, outof_bound && burndary_index_w);
    input0.z = select(input0.z, zero, outof_bound && burndary_index_z);
    input0.y = select(input0.y, zero, outof_bound && burndary_index_y);
+#ifdef DEBUG
+    if (output_pos0.x == 0 && output_pos0.y == 0) {
+      printf("i ={ %d, }\n", i);
+      printf("in={ %f , %f , %f , %f } \n",
+             convert_float(input0.x),
+             convert_float(input0.y),
+             convert_float(input0.z),
+             convert_float(input0.w));
+      printf("filter0={ %f , %f , %f , %f } \n",
+             convert_float(weight0.x),
+             convert_float(weight0.y),
+             convert_float(weight0.z),
+             convert_float(weight0.w));
+      printf("filter1={ %f , %f , %f , %f } \n",
+             convert_float(weight1.x),
+             convert_float(weight1.y),
+             convert_float(weight1.z),
+             convert_float(weight1.w));
+      printf("filter2={ %f , %f , %f , %f } \n",
+             convert_float(weight2.x),
+             convert_float(weight2.y),
+             convert_float(weight2.z),
+             convert_float(weight2.w));
+      printf("filter3={ %f , %f , %f , %f } \n",
+             convert_float(weight3.x),
+             convert_float(weight3.y),
+             convert_float(weight3.z),
+             convert_float(weight3.w));
+      printf("000---- output={ %f , %f , %f , %f } \n",
+             convert_float(output0.x),
+             convert_float(output0.y),
+             convert_float(output0.z),
+             convert_float(output0.w));
+    }
+#endif
    output0 = mad(input0.x, weight0, output0);
+#ifdef DEBUG
+    if (output_pos0.x == 0 && output_pos0.y == 0) {
+      printf("111---- output={ %f , %f , %f , %f } \n",
+             convert_float(output0.x),
+             convert_float(output0.y),
+             convert_float(output0.z),
+             convert_float(output0.w));
+    }
+#endif
    output0 = mad(input0.y, weight1, output0);
+#ifdef DEBUG
+    if (output_pos0.x == 0 && output_pos0.y == 0) {
+      printf("222---- output={ %f , %f , %f , %f } \n",
+             convert_float(output0.x),
+             convert_float(output0.y),
+             convert_float(output0.z),
+             convert_float(output0.w));
+    }
+#endif
    output0 = mad(input0.z, weight2, output0);
+#ifdef DEBUG
+    if (output_pos0.x == 0 && output_pos0.y == 0) {
+      printf("333---- output={ %f , %f , %f , %f } \n",
+             convert_float(output0.x),
+             convert_float(output0.y),
+             convert_float(output0.z),
+             convert_float(output0.w));
+    }
+#endif
    output0 = mad(input0.w, weight3, output0);
+#ifdef DEBUG
+    if (output_pos0.x == 0 && output_pos0.y == 0) {
+      printf("444---- output={ %f , %f , %f , %f } \n",
+             convert_float(output0.x),
+             convert_float(output0.y),
+             convert_float(output0.z),
+             convert_float(output0.w));
+    }
+#endif
    // -------------1--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                    in_pos_in_one_block1.y);
@@ -171,6 +241,43 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
    output3 = mad(input3.y, weight1, output3);
    output3 = mad(input3.z, weight2, output3);
    output3 = mad(input3.w, weight3, output3);
+#ifdef DEBUG
+    if (output_pos0.x == 0 && output_pos0.y == 0) {
+      //  printf("i,j,k ={ %d, %d , %d }\n", i,j,k);
+      printf("i ={ %d, }\n", i);
+      printf("in={ %f , %f , %f , %f } \n",
+             convert_float(input0.x),
+             convert_float(input0.y),
+             convert_float(input0.z),
+             convert_float(input0.w));
+      printf("filter0={ %f , %f , %f , %f } \n",
+             convert_float(weight0.x),
+             convert_float(weight0.y),
+             convert_float(weight0.z),
+             convert_float(weight0.w));
+      printf("filter1={ %f , %f , %f , %f } \n",
+             convert_float(weight1.x),
+             convert_float(weight1.y),
+             convert_float(weight1.z),
+             convert_float(weight1.w));
+      printf("filter2={ %f , %f , %f , %f } \n",
+             convert_float(weight2.x),
+             convert_float(weight2.y),
+             convert_float(weight2.z),
+             convert_float(weight2.w));
+      printf("filter3={ %f , %f , %f , %f } \n",
+             convert_float(weight3.x),
+             convert_float(weight3.y),
+             convert_float(weight3.z),
+             convert_float(weight3.w));
+      printf("output={ %f , %f , %f , %f } \n",
+             convert_float(output0.x),
+             convert_float(output0.y),
+             convert_float(output0.z),
+             convert_float(output0.w));
+    }
+#endif
  }
 #ifdef BATCH_NORM
@@ -195,7 +302,6 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
  output1 = activation_type4(output1);
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);
  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
  }
@@ -213,29 +319,30 @@ __kernel void conv2d_1x1(__private const int global_size_dim0,
  }
 }
-__kernel void conv2d_1x1_simple(__private const int global_size_dim0,
+__kernel void conv2d_1x1_simple(
-                         __private const int global_size_dim1,
+    __private const int global_size_dim0,
-                         __private const int global_size_dim2,
+    __private const int global_size_dim1,
-                         __read_only image2d_t input_image,
+    __private const int global_size_dim2,
-                         __read_only image2d_t filter,
+    __read_only image2d_t input_image,
+    __read_only image2d_t filter,
 #if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
 #endif
 #ifdef BATCH_NORM
-__read_only image2d_t new_scale,
+    __read_only image2d_t new_scale,
-                         __read_only image2d_t new_biase,
+    __read_only image2d_t new_biase,
 #endif
-                         __write_only image2d_t output_image,
+    __write_only image2d_t output_image,
-                         __private const int stride,
+    __private const int stride,
-                         __private const int offset,
+    __private const int offset,
-                         __private const int input_c,
+    __private const int input_c,
-                         __private const int input_c_origin,
+    __private const int input_c_origin,
-                         __private const int dilation,
+    __private const int dilation,
-                         __private const int input_width,  /* of one block */
+    __private const int input_width,  /* of one block */
-                         __private const int input_height, /* of one block */
+    __private const int input_height, /* of one block */
-                         __private const int output_width,
+    __private const int output_width,
-                         __private const int output_height,
+    __private const int output_height,
-                         __private const int old_w) {
+    __private const int old_w) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
@@ -358,13 +465,11 @@ __read_only image2d_t new_scale,
            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
 #endif
  output0 = activation_type4(output0);
  output1 = activation_type4(output1);
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);
  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
  }

--- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
@@ -36,10 +36,10 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
  const int batch_index = out_nh / output_height;
  const int out_nh_in_one_batch = out_nh % output_height;
-  const filter_n0 = 4 * out_c + 0;
+  const int filter_n0 = 4 * out_c + 0;
-  const filter_n1 = 4 * out_c + 1;
+  const int filter_n1 = 4 * out_c + 1;
-  const filter_n2 = 4 * out_c + 2;
+  const int filter_n2 = 4 * out_c + 2;
-  const filter_n3 = 4 * out_c + 3;
+  const int filter_n3 = 4 * out_c + 3;
  int2 stride_xy;
  stride_xy.x = stride;

--- a/lite/kernels/opencl/conv_image_compute_test.cc
+++ b/lite/kernels/opencl/conv_image_compute_test.cc