[LITE][OPENCL] Fix first 3 rerun errors in mnasnet, test=develop (#3450)

a98dbcc1 · xiebaiyuan · GitHub · 0294152f · a98dbcc1 · a98dbcc1
6 changed files
--- a/lite/backends/opencl/cl_context.h
+++ b/lite/backends/opencl/cl_context.h
@@ -28,6 +28,7 @@ namespace lite {
 class CLContext {
 public:
  ~CLContext() {
+    GetCommandQueue().finish();
    for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
      // Note(ysh329): Don't need `clReleaseKernel`
      kernels_[kidx].reset();

--- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
 #include <cl_common.h>
+
 __kernel void conv2d_1x1_opt(
    __private const int global_size_dim0,
    __private const int global_size_dim1,
@@ -27,10 +28,7 @@ __kernel void conv2d_1x1_opt(
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
-  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
-      out_nh >= global_size_dim2) {
-    return;
-  }
+
  int out_w0 = out_w;
  int out_w1 = out_w + global_size_dim1;
  int out_w2 = out_w + global_size_dim1 * 2;
@@ -76,10 +74,10 @@ __kernel void conv2d_1x1_opt(
  CL_DTYPE4 output3 = output0;

 #else
-  CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
+  CL_DTYPE4 output0 = 0.0f;
+  CL_DTYPE4 output1 = 0.0f;
+  CL_DTYPE4 output2 = 0.0f;
+  CL_DTYPE4 output3 = 0.0f;
 #endif

  int max_w_bound = input_c_block * input_width;
@@ -88,14 +86,6 @@ __kernel void conv2d_1x1_opt(
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                         in_pos_in_one_block0.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input0 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

@@ -142,14 +132,6 @@ __kernel void conv2d_1x1_opt(
    // -------------1--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                    in_pos_in_one_block1.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input1 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

@@ -186,14 +168,6 @@ __kernel void conv2d_1x1_opt(
    // -------------2--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
                    in_pos_in_one_block2.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input2 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

@@ -230,14 +204,6 @@ __kernel void conv2d_1x1_opt(
    // -------------3--------------
    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
                    in_pos_in_one_block3.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input3 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

@@ -339,10 +305,7 @@ __kernel void conv2d_1x1_simple(
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
-  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
-      out_nh >= global_size_dim2) {
-    return;
-  }
+
  int out_w0 = out_w;
  int out_w1 = out_w + global_size_dim1;
  int out_w2 = out_w + global_size_dim1 * 2;
@@ -388,25 +351,16 @@ __kernel void conv2d_1x1_simple(
  CL_DTYPE4 output3 = output0;

 #else
-  CL_DTYPE4 output0 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output1 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output2 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
-  CL_DTYPE4 output3 = (CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f);
+  CL_DTYPE4 output0 = 0.0f;
+  CL_DTYPE4 output1 = 0.0f;
+  CL_DTYPE4 output2 = 0.0f;
+  CL_DTYPE4 output3 = 0.0f;
 #endif

  for (int i = 0; i < input_c; ++i) {
    // ------------0---------------
    int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
                         in_pos_in_one_block0.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
-
    CL_DTYPE4 input0 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);

@@ -426,15 +380,6 @@ __kernel void conv2d_1x1_simple(

    pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
                    in_pos_in_one_block1.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
-
    CL_DTYPE4 input1 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
    output1 = mad(input1.x, weight0, output1);
@@ -444,14 +389,6 @@ __kernel void conv2d_1x1_simple(

    pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
                    in_pos_in_one_block2.y);
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
    CL_DTYPE4 input2 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
    output2 = mad(input2.x, weight0, output2);
@@ -461,16 +398,6 @@ __kernel void conv2d_1x1_simple(

    pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
                    in_pos_in_one_block3.y);
-
-    pos_in.x = select(
-        pos_in.x,
-        -1,
-        (pos_in.x < i * input_width + in_pos_in_one_block0.x ||
-         pos_in.x >= i * input_width + in_pos_in_one_block0.x + input_width));
-
-    pos_in.y =
-        select(pos_in.y, -1, (pos_in.y < 0 || pos_in.y >= global_size_dim2));
-
    CL_DTYPE4 input3 =
        READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in);
    output3 = mad(input3.x, weight0, output3);
@@ -502,16 +429,6 @@ __kernel void conv2d_1x1_simple(
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);

-  // const int debug_pos = 0;
-  // int2 pos_test = (int2)(debug_pos, debug_pos);
-  // if (input_height == 112 && input_width == 112 && output_width == 112 &&
-  //     output_height == 112) {
-  //   output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_test);
-  //   output1 = output0;
-  //   output2 = output1;
-  //   output3 = output2;
-  // }
-
  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
  }

--- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <cl_common.h>
+
 __kernel void conv2d_3x3_opt(__private const int item_ch,
                             __private const int item_w,
                             __private const int item_h,

--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
--- a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl
@@ -30,10 +30,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in,
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

-  if (out_c >= out_C || out_w >= out_W || out_nh >= out_H) {
-    return;
-  }
-
  const int out_n = out_nh / out_H;
  const int out_h = out_nh % out_H;

@@ -59,18 +55,12 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in,

  if (out_C - 4 * out_c >= 2) {
    output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE);
-  } else {
-    output.y = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);
  }
  if (out_C - 4 * out_c >= 3) {
    output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE);
-  } else {
-    output.z = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);
  }
  if (out_C - 4 * out_c >= 4) {
    output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE);
-  } else {
-    output.w = CONVERT_TYPE_TO(0.f, CL_COMPUTE_DTYPE);
  }

 #ifdef DEBUG
@@ -146,11 +136,9 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
  if (C - 4 * in_c >= 2) {
    out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
  }
-
  if (C - 4 * in_c >= 3) {
    out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
  }
-
  if (C - 4 * in_c >= 4) {
    out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
  }

--- a/lite/backends/opencl/target_wrapper.cc
+++ b/lite/backends/opencl/target_wrapper.cc
@@ -90,7 +90,7 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
  cl_int status;
  cl::Image2D *cl_image =
      new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_USE_HOST_PTR
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
                                                    : CL_MEM_ALLOC_HOST_PTR),
                      img_format,
                      cl_image2d_width,