[Lite][OpenCL]Add pre_process & post_process layout kernel (#3014)

* [Lite][OpenCL]Add pre_process & post_process layout kernel. test=develop * [Lite][OpenCL]pre_process & post_process layout kernel Code style.test=develop

[Lite][OpenCL]Add pre_process & post_process layout kernel (#3014)
* [Lite][OpenCL]Add pre_process & post_process layout kernel. test=develop * [Lite][OpenCL]pre_process & post_process layout kernel Code style.test=develop
b908e834 · Jiaying Zhao · GitHub · 360b4013 · b908e834 · b908e834
4 changed file
--- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl
@@ -199,3 +199,85 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
  out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w);
 }
 #endif
+
+// buffer -> image2d (divide by 255 to normalize)
+__kernel void buffer_to_image2d_with_pre255(__global uchar *in,
+                                            __write_only image2d_t output_image,
+                                            __private const int out_H,
+                                            __private const int out_W,
+                                            __private const int out_C,
+                                            __private const int Stride0,
+                                            __private const int Stride1,
+                                            __private const int Stride2){
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const int out_n = out_nh / out_H;
+  const int out_h = out_nh % out_H;
+
+  const int in_n = out_n;
+  const int in_c0 = out_c * 4 + 0;
+  const int in_c1 = out_c * 4 + 1;
+  const int in_c2 = out_c * 4 + 2;
+  const int in_c3 = out_c * 4 + 3;
+  const int in_h = out_h;
+  const int in_w = out_w;
+
+
+  int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
+  int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
+  int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
+  int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+
+  CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
+  output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
+  if(out_C - 4 * out_c>=2){
+      output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
+  }
+  if(out_C - 4 * out_c>=3){
+      output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
+  }
+  if(out_C - 4 * out_c>=4){
+      output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
+  }
+  WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
+}
+
+// image2d -> buffer (multiply by 255 to de-normalize)
+__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
+                                            __private const int in_width,
+                                            __private const int in_height,
+                                            __global uchar* out,
+                                            __private const int size_ch,
+                                            __private const int size_block,
+                                            __private const int size_batch,
+                                            __private const int C) {
+  const int in_c = get_global_id(0);
+  const int in_w = get_global_id(1);
+  const int in_nh = get_global_id(2);
+  const int in_n = in_nh / in_height;
+  const int in_h = in_nh % in_height;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  const int pos_x = mad24(in_c, in_width, in_w);
+  CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
+
+  const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
+  out[index] = convert_uchar_sat(in.x * 255);
+  if(C - 4 * in_c>=2){
+    out[index + size_ch] = convert_uchar_sat(in.y * 255);
+  }
+  if(C - 4 * in_c>=3){
+    out[index + size_ch * 2] = convert_uchar_sat(in.z * 255);
+  }
+  if(C - 4 * in_c>=4){
+    out[index + size_ch * 3] = convert_uchar_sat(in.w * 255);
+  }
+}
--- a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl
@@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input,
                            CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
-  in = convert_float(scale) * in + convert_float(bias);
+  in = CONVERT_TYPE_TO(scale, CL_DTYPE) * in + CONVERT_TYPE_TO(bias, CL_DTYPE);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
 }
--- a/lite/kernels/opencl/layout_compute.cc
+++ b/lite/kernels/opencl/layout_compute.cc
@@ -38,6 +38,10 @@ class LayoutComputeBufferChwToImageDefault
  using param_t = operators::LayoutParam;

  void PrepareForRun() override {
+    auto& param = Param<param_t>();
+    if (param.process_type == 1) {
+      kernel_func_name_ = "buffer_to_image2d_with_pre255";
+    }
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
@@ -45,7 +49,12 @@ class LayoutComputeBufferChwToImageDefault

  void Run() override {
    auto& param = Param<param_t>();
-    auto* x_data = param.x->data<float, cl::Buffer>();
+    const cl::Buffer* x_data;
+    if (param.process_type == 1) {
+      x_data = param.x->data<uint8_t, cl::Buffer>();
+    } else {
+      x_data = param.x->data<float, cl::Buffer>();
+    }
    auto x_dims = param.x->dims();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* y_data = param.y->mutable_data<half_t, cl::Image2D>(
@@ -140,6 +149,10 @@ class LayoutComputeImageDefaultToBufferChw
  using param_t = operators::LayoutParam;

  void PrepareForRun() override {
+    auto& param = Param<param_t>();
+    if (param.process_type == 1) {
+      kernel_func_name_ = "image2d_to_buffer_with_post255";
+    }
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
@@ -147,9 +160,14 @@ class LayoutComputeImageDefaultToBufferChw

  void Run() override {
    auto& param = Param<param_t>();
+    const cl::Buffer* y_data;
+    if (param.process_type == 1) {
+      y_data = param.y->mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
+    } else {
+      y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
+    }
    auto* x_data = param.x->data<half_t, cl::Image2D>();
    auto x_dims = param.x->dims();
-    auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
    auto y_dims = param.y->dims();
    auto x_image_shape = InitImageDimInfoWith(x_dims);


--- a/lite/kernels/opencl/layout_compute_test.cc
+++ b/lite/kernels/opencl/layout_compute_test.cc
@@ -148,6 +148,132 @@ TEST(layout_ImageDefault, compute) {
 #endif
 }

+TEST(layout_ImageDefault_With_Pre_Post, compute) {
+  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> layout(img2buf) "
+               "-> device";
+
+#ifdef LOOP_TEST
+  for (int n = 1; n <= 2; n += 1) {
+    for (auto c : {1, 3}) {
+      for (int h = 1; h <= 10; h += 1) {
+        for (int w = 1; w <= 10; w += 1) {
+#else
+          const int n = 1;
+          const int c = 2;
+          const int h = 3;
+          const int w = 4;
+#endif  // LOOP_TEST
+
+          LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
+                    << h << " " << w << " ========";
+          // set layout kernels
+          auto buf_to_img_kernels =
+              KernelRegistry::Global().Create("layout",
+                                              TARGET(kOpenCL),
+                                              PRECISION(kAny),
+                                              DATALAYOUT(kImageDefault));
+          auto img_to_buf_kernels = KernelRegistry::Global().Create(
+              "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
+          ASSERT_FALSE(buf_to_img_kernels.empty());
+          ASSERT_FALSE(buf_to_img_kernels.empty());
+
+          auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
+          auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
+          LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
+          LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
+
+          // set tensors about op param
+          LOG(INFO) << "set tensors about op param";
+          lite::Tensor x, y_image, y;
+          operators::LayoutParam BufferToImageParam;
+          operators::LayoutParam ImageToBufferParam;
+          BufferToImageParam.x = &x;
+          BufferToImageParam.y = &y_image;
+          BufferToImageParam.process_type = 1;
+          ImageToBufferParam.x = &y_image;
+          ImageToBufferParam.y = &y;
+          ImageToBufferParam.process_type = 1;
+
+          const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
+          x.Resize(x_dim);
+          y_image.Resize(x_dim);  // useless for image2D
+          y.Resize(x_dim);
+
+          // initialize tensors
+          LOG(INFO) << "initialize tensors";
+          auto* x_data = x.mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
+          auto* y_data = y.mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
+          auto image_shape =
+              paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
+          auto* y_image_data = y_image.mutable_data<half_t, cl::Image2D>(
+              image_shape["width"], image_shape["height"]);
+          auto* mapped_x = static_cast<uint8_t*>(TargetWrapperCL::Map(
+              x_data, 0, sizeof(uint8_t) * x_dim.production()));
+          auto* mapped_y = static_cast<uint8_t*>(TargetWrapperCL::Map(
+              y_data, 0, sizeof(uint8_t) * x_dim.production()));
+          for (int i = 0; i < x_dim.production(); ++i) {
+            mapped_x[i] = static_cast<uint8_t>(i % 256);
+          }
+
+          // set context and kernel args
+          LOG(INFO) << "set context and kernel args";
+          std::unique_ptr<KernelContext> context(new KernelContext);
+          context->As<OpenCLContext>().InitOnce();
+
+          buf_to_img_kernel->SetParam(BufferToImageParam);
+          std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
+          context->As<OpenCLContext>().CopySharedTo(
+              &(buf_to_img_context->As<OpenCLContext>()));
+          buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
+
+          img_to_buf_kernel->SetParam(ImageToBufferParam);
+          std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
+          context->As<OpenCLContext>().CopySharedTo(
+              &(img_to_buf_context->As<OpenCLContext>()));
+          img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
+
+          // run kernels
+          LOG(INFO) << "run kernel: buffer_to_image2d_with_pre255";
+          buf_to_img_kernel->Launch();
+          LOG(INFO) << "run kernel: image2d_to_buffer_with_post255";
+          img_to_buf_kernel->Launch();
+
+// result
+#ifdef PRINT_RESULT
+          LOG(INFO) << "---- print result ----";
+          for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
+            std::cout << mapped_x[eidx] << " -> "
+                      << static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
+          }
+#endif  // PRINT_RESULT
+
+          // check result: compare input and output
+          float MAX_PASS_DIFF = 1;
+          for (int eidx = 0; eidx < x_dim.production(); eidx++) {
+            EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF);
+            if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) {
+              LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
+                        << " / " << x_dim.production() << ", mapped_x[" << eidx
+                        << "]:" << mapped_x[eidx] << ", mapped_y[" << eidx
+                        << "]:" << mapped_y[eidx];
+              break;
+            }
+          }
+
+          // free
+          LOG(INFO) << "free: unmap x, y";
+          TargetWrapperCL::Unmap(x_data, mapped_x);
+          TargetWrapperCL::Unmap(y_data, mapped_y);
+#ifdef LOOP_TEST
+        }  // w
+      }    // h
+    }      // c
+  }        // n
+#else
+// nothing to do.
+#endif
+}
+
 #if 0
 TEST(layout_ImageNW, compute) {
 #ifdef LOOP_TEST