From b908e8348e5675f64c111240f2ed40b9147faf3e Mon Sep 17 00:00:00 2001 From: Jiaying Zhao Date: Wed, 26 Feb 2020 12:13:58 +0800 Subject: [PATCH] [Lite][OpenCL]Add pre_process & post_process layout kernel (#3014) * [Lite][OpenCL]Add pre_process & post_process layout kernel. test=develop * [Lite][OpenCL]pre_process & post_process layout kernel Code style.test=develop --- .../opencl/cl_kernel/buffer/layout_kernel.cl | 82 ++++++++++++ .../opencl/cl_kernel/image/scale_kernel.cl | 2 +- lite/kernels/opencl/layout_compute.cc | 22 ++- lite/kernels/opencl/layout_compute_test.cc | 126 ++++++++++++++++++ 4 files changed, 229 insertions(+), 3 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl index 86a0118fb1..65f4e9fb03 100644 --- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl @@ -199,3 +199,85 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w); } #endif + +// buffer -> image2d (divide by 255 to normalize) +__kernel void buffer_to_image2d_with_pre255(__global uchar *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2){ + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; + if(out_C - 4 * out_c>=2){ + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; + } + if(out_C - 4 * out_c>=3){ + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; + } + if(out_C - 4 * out_c>=4){ + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; + } + WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + +// image2d -> buffer (multiply by 255 to de-normalize) +__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global uchar* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = convert_uchar_sat(in.x * 255); + if(C - 4 * in_c>=2){ + out[index + size_ch] = convert_uchar_sat(in.y * 255); + } + if(C - 4 * in_c>=3){ + out[index + size_ch * 2] = convert_uchar_sat(in.z * 255); + } + if(C - 4 * in_c>=4){ + out[index + size_ch * 3] = convert_uchar_sat(in.w * 255); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl index 739ff13385..dfc25063cc 100644 --- a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl @@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input, CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = convert_float(scale) * in + convert_float(bias); + in = CONVERT_TYPE_TO(scale, CL_DTYPE) * in + CONVERT_TYPE_TO(bias, CL_DTYPE); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } diff --git a/lite/kernels/opencl/layout_compute.cc b/lite/kernels/opencl/layout_compute.cc index 046f667dbe..2a82aec526 100644 --- a/lite/kernels/opencl/layout_compute.cc +++ b/lite/kernels/opencl/layout_compute.cc @@ -38,6 +38,10 @@ class LayoutComputeBufferChwToImageDefault using param_t = operators::LayoutParam; void PrepareForRun() override { + auto& param = Param(); + if (param.process_type == 1) { + kernel_func_name_ = "buffer_to_image2d_with_pre255"; + } auto& context = ctx_->As(); context.cl_context()->AddKernel( kernel_func_name_, "buffer/layout_kernel.cl", build_options_); @@ -45,7 +49,12 @@ class LayoutComputeBufferChwToImageDefault void Run() override { auto& param = Param(); - auto* x_data = param.x->data(); + const cl::Buffer* x_data; + if (param.process_type == 1) { + x_data = param.x->data(); + } else { + x_data = param.x->data(); + } auto x_dims = param.x->dims(); auto image_shape = InitImageDimInfoWith(x_dims); auto* y_data = param.y->mutable_data( @@ -140,6 +149,10 @@ class LayoutComputeImageDefaultToBufferChw using param_t = operators::LayoutParam; void PrepareForRun() override { + auto& param = Param(); + if (param.process_type == 1) { + kernel_func_name_ = "image2d_to_buffer_with_post255"; + } auto& context = ctx_->As(); context.cl_context()->AddKernel( kernel_func_name_, "buffer/layout_kernel.cl", build_options_); @@ -147,9 +160,14 @@ class LayoutComputeImageDefaultToBufferChw void Run() override { auto& param = Param(); + const cl::Buffer* y_data; + if (param.process_type == 1) { + y_data = param.y->mutable_data(TARGET(kOpenCL)); + } else { + y_data = param.y->mutable_data(TARGET(kOpenCL)); + } auto* x_data = param.x->data(); auto x_dims = param.x->dims(); - auto* y_data = param.y->mutable_data(TARGET(kOpenCL)); auto y_dims = param.y->dims(); auto x_image_shape = InitImageDimInfoWith(x_dims); diff --git a/lite/kernels/opencl/layout_compute_test.cc b/lite/kernels/opencl/layout_compute_test.cc index 93a5be5195..a523c896fa 100644 --- a/lite/kernels/opencl/layout_compute_test.cc +++ b/lite/kernels/opencl/layout_compute_test.cc @@ -148,6 +148,132 @@ TEST(layout_ImageDefault, compute) { #endif } +TEST(layout_ImageDefault_With_Pre_Post, compute) { + LOG(INFO) << "main steps of test: host -> layout(buf2img) -> layout(img2buf) " + "-> device"; + +#ifdef LOOP_TEST + for (int n = 1; n <= 2; n += 1) { + for (auto c : {1, 3}) { + for (int h = 1; h <= 10; h += 1) { + for (int w = 1; w <= 10; w += 1) { +#else + const int n = 1; + const int c = 2; + const int h = 3; + const int w = 4; +#endif // LOOP_TEST + + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " + << h << " " << w << " ========"; + // set layout kernels + auto buf_to_img_kernels = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + auto img_to_buf_kernels = KernelRegistry::Global().Create( + "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); + ASSERT_FALSE(buf_to_img_kernels.empty()); + ASSERT_FALSE(buf_to_img_kernels.empty()); + + auto buf_to_img_kernel = std::move(buf_to_img_kernels.front()); + auto img_to_buf_kernel = std::move(img_to_buf_kernels.front()); + LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc(); + LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc(); + + // set tensors about op param + LOG(INFO) << "set tensors about op param"; + lite::Tensor x, y_image, y; + operators::LayoutParam BufferToImageParam; + operators::LayoutParam ImageToBufferParam; + BufferToImageParam.x = &x; + BufferToImageParam.y = &y_image; + BufferToImageParam.process_type = 1; + ImageToBufferParam.x = &y_image; + ImageToBufferParam.y = &y; + ImageToBufferParam.process_type = 1; + + const DDim x_dim = DDim(std::vector{n, c, h, w}); + x.Resize(x_dim); + y_image.Resize(x_dim); // useless for image2D + y.Resize(x_dim); + + // initialize tensors + LOG(INFO) << "initialize tensors"; + auto* x_data = x.mutable_data(TARGET(kOpenCL)); + auto* y_data = y.mutable_data(TARGET(kOpenCL)); + auto image_shape = + paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim); + auto* y_image_data = y_image.mutable_data( + image_shape["width"], image_shape["height"]); + auto* mapped_x = static_cast(TargetWrapperCL::Map( + x_data, 0, sizeof(uint8_t) * x_dim.production())); + auto* mapped_y = static_cast(TargetWrapperCL::Map( + y_data, 0, sizeof(uint8_t) * x_dim.production())); + for (int i = 0; i < x_dim.production(); ++i) { + mapped_x[i] = static_cast(i % 256); + } + + // set context and kernel args + LOG(INFO) << "set context and kernel args"; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + buf_to_img_kernel->SetParam(BufferToImageParam); + std::unique_ptr buf_to_img_context(new KernelContext); + context->As().CopySharedTo( + &(buf_to_img_context->As())); + buf_to_img_kernel->SetContext(std::move(buf_to_img_context)); + + img_to_buf_kernel->SetParam(ImageToBufferParam); + std::unique_ptr img_to_buf_context(new KernelContext); + context->As().CopySharedTo( + &(img_to_buf_context->As())); + img_to_buf_kernel->SetContext(std::move(img_to_buf_context)); + + // run kernels + LOG(INFO) << "run kernel: buffer_to_image2d_with_pre255"; + buf_to_img_kernel->Launch(); + LOG(INFO) << "run kernel: image2d_to_buffer_with_post255"; + img_to_buf_kernel->Launch(); + +// result +#ifdef PRINT_RESULT + LOG(INFO) << "---- print result ----"; + for (int eidx = 0; eidx < x_dim.production(); ++eidx) { + std::cout << mapped_x[eidx] << " -> " + << static_cast(mapped_y[eidx]) << std::endl; + } +#endif // PRINT_RESULT + + // check result: compare input and output + float MAX_PASS_DIFF = 1; + for (int eidx = 0; eidx < x_dim.production(); eidx++) { + EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF); + if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) { + LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx + << " / " << x_dim.production() << ", mapped_x[" << eidx + << "]:" << mapped_x[eidx] << ", mapped_y[" << eidx + << "]:" << mapped_y[eidx]; + break; + } + } + + // free + LOG(INFO) << "free: unmap x, y"; + TargetWrapperCL::Unmap(x_data, mapped_x); + TargetWrapperCL::Unmap(y_data, mapped_y); +#ifdef LOOP_TEST + } // w + } // h + } // c + } // n +#else +// nothing to do. +#endif +} + #if 0 TEST(layout_ImageNW, compute) { #ifdef LOOP_TEST -- GitLab