未验证 提交 b908e834 编写于 作者: J Jiaying Zhao 提交者: GitHub

[Lite][OpenCL]Add pre_process & post_process layout kernel (#3014)

* [Lite][OpenCL]Add pre_process & post_process layout kernel. test=develop

* [Lite][OpenCL]pre_process & post_process layout kernel Code style.test=develop
上级 360b4013
...@@ -199,3 +199,85 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -199,3 +199,85 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w); out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w);
} }
#endif #endif
// buffer -> image2d (divide by 255 to normalize)
__kernel void buffer_to_image2d_with_pre255(__global uchar *in,
__write_only image2d_t output_image,
__private const int out_H,
__private const int out_W,
__private const int out_C,
__private const int Stride0,
__private const int Stride1,
__private const int Stride2){
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
const int in_n = out_n;
const int in_c0 = out_c * 4 + 0;
const int in_c1 = out_c * 4 + 1;
const int in_c2 = out_c * 4 + 2;
const int in_c3 = out_c * 4 + 3;
const int in_h = out_h;
const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
}
if(out_C - 4 * out_c>=3){
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
}
if(out_C - 4 * out_c>=4){
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
}
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
}
// image2d -> buffer (multiply by 255 to de-normalize)
__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
__private const int in_width,
__private const int in_height,
__global uchar* out,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__private const int C) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x * 255);
if(C - 4 * in_c>=2){
out[index + size_ch] = convert_uchar_sat(in.y * 255);
}
if(C - 4 * in_c>=3){
out[index + size_ch * 2] = convert_uchar_sat(in.z * 255);
}
if(C - 4 * in_c>=4){
out[index + size_ch * 3] = convert_uchar_sat(in.w * 255);
}
}
...@@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input, ...@@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input,
CLK_FILTER_NEAREST; CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = convert_float(scale) * in + convert_float(bias); in = CONVERT_TYPE_TO(scale, CL_DTYPE) * in + CONVERT_TYPE_TO(bias, CL_DTYPE);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
...@@ -38,6 +38,10 @@ class LayoutComputeBufferChwToImageDefault ...@@ -38,6 +38,10 @@ class LayoutComputeBufferChwToImageDefault
using param_t = operators::LayoutParam; using param_t = operators::LayoutParam;
void PrepareForRun() override { void PrepareForRun() override {
auto& param = Param<param_t>();
if (param.process_type == 1) {
kernel_func_name_ = "buffer_to_image2d_with_pre255";
}
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/layout_kernel.cl", build_options_); kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
...@@ -45,7 +49,12 @@ class LayoutComputeBufferChwToImageDefault ...@@ -45,7 +49,12 @@ class LayoutComputeBufferChwToImageDefault
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
auto* x_data = param.x->data<float, cl::Buffer>(); const cl::Buffer* x_data;
if (param.process_type == 1) {
x_data = param.x->data<uint8_t, cl::Buffer>();
} else {
x_data = param.x->data<float, cl::Buffer>();
}
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto image_shape = InitImageDimInfoWith(x_dims); auto image_shape = InitImageDimInfoWith(x_dims);
auto* y_data = param.y->mutable_data<half_t, cl::Image2D>( auto* y_data = param.y->mutable_data<half_t, cl::Image2D>(
...@@ -140,6 +149,10 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -140,6 +149,10 @@ class LayoutComputeImageDefaultToBufferChw
using param_t = operators::LayoutParam; using param_t = operators::LayoutParam;
void PrepareForRun() override { void PrepareForRun() override {
auto& param = Param<param_t>();
if (param.process_type == 1) {
kernel_func_name_ = "image2d_to_buffer_with_post255";
}
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "buffer/layout_kernel.cl", build_options_); kernel_func_name_, "buffer/layout_kernel.cl", build_options_);
...@@ -147,9 +160,14 @@ class LayoutComputeImageDefaultToBufferChw ...@@ -147,9 +160,14 @@ class LayoutComputeImageDefaultToBufferChw
void Run() override { void Run() override {
auto& param = Param<param_t>(); auto& param = Param<param_t>();
const cl::Buffer* y_data;
if (param.process_type == 1) {
y_data = param.y->mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
} else {
y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
}
auto* x_data = param.x->data<half_t, cl::Image2D>(); auto* x_data = param.x->data<half_t, cl::Image2D>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto* y_data = param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto y_dims = param.y->dims(); auto y_dims = param.y->dims();
auto x_image_shape = InitImageDimInfoWith(x_dims); auto x_image_shape = InitImageDimInfoWith(x_dims);
......
...@@ -148,6 +148,132 @@ TEST(layout_ImageDefault, compute) { ...@@ -148,6 +148,132 @@ TEST(layout_ImageDefault, compute) {
#endif #endif
} }
TEST(layout_ImageDefault_With_Pre_Post, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> layout(img2buf) "
"-> device";
#ifdef LOOP_TEST
for (int n = 1; n <= 2; n += 1) {
for (auto c : {1, 3}) {
for (int h = 1; h <= 10; h += 1) {
for (int w = 1; w <= 10; w += 1) {
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
#endif // LOOP_TEST
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " "
<< h << " " << w << " ========";
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels = KernelRegistry::Global().Create(
"layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
ASSERT_FALSE(buf_to_img_kernels.empty());
ASSERT_FALSE(buf_to_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
lite::Tensor x, y_image, y;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &y_image;
BufferToImageParam.process_type = 1;
ImageToBufferParam.x = &y_image;
ImageToBufferParam.y = &y;
ImageToBufferParam.process_type = 1;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
x.Resize(x_dim);
y_image.Resize(x_dim); // useless for image2D
y.Resize(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto* x_data = x.mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
auto* y_data = y.mutable_data<uint8_t, cl::Buffer>(TARGET(kOpenCL));
auto image_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
auto* y_image_data = y_image.mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
auto* mapped_x = static_cast<uint8_t*>(TargetWrapperCL::Map(
x_data, 0, sizeof(uint8_t) * x_dim.production()));
auto* mapped_y = static_cast<uint8_t*>(TargetWrapperCL::Map(
y_data, 0, sizeof(uint8_t) * x_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<uint8_t>(i % 256);
}
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
// run kernels
LOG(INFO) << "run kernel: buffer_to_image2d_with_pre255";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: image2d_to_buffer_with_post255";
img_to_buf_kernel->Launch();
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print result ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " -> "
<< static_cast<uint8_t>(mapped_y[eidx]) << std::endl;
}
#endif // PRINT_RESULT
// check result: compare input and output
float MAX_PASS_DIFF = 1;
for (int eidx = 0; eidx < x_dim.production(); eidx++) {
EXPECT_NEAR(mapped_x[eidx], mapped_y[eidx], MAX_PASS_DIFF);
if (abs(mapped_x[eidx] - mapped_y[eidx]) > MAX_PASS_DIFF) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", mapped_x[" << eidx
<< "]:" << mapped_x[eidx] << ", mapped_y[" << eidx
<< "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
#if 0 #if 0
TEST(layout_ImageNW, compute) { TEST(layout_ImageNW, compute) {
#ifdef LOOP_TEST #ifdef LOOP_TEST
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册