From 394c2833ed96987e41415dc5ff42f5e343215f52 Mon Sep 17 00:00:00 2001 From: ysh329 Date: Fri, 28 Aug 2020 16:27:17 +0800 Subject: [PATCH] [KERNEL][OPENCL] Fix concat; Enable conv3x3 with group (#4180) * [KERNEL][OPENCL] support opencl concat 3inputs, 4inputs of cases; enable conv3x3 with group; fix buffer opencl concat; clean and clear cmake of opencl kernels. test=develop --- .../opencl/cl_kernel/buffer/concat_kernel.cl | 89 ++- .../cl_kernel/image/activation_kernel.cl | 2 +- .../opencl/cl_kernel/image/concat_kernel.cl | 339 +++++++-- lite/kernels/opencl/CMakeLists.txt | 126 ++-- .../opencl/activation_image_compute.cc | 2 +- lite/kernels/opencl/concat_buffer_compute.cc | 15 +- .../opencl/concat_buffer_compute_test.cc | 13 +- lite/kernels/opencl/concat_image_compute.cc | 402 +++++++--- .../opencl/concat_image_compute_test.cc | 704 ++++++++++++------ lite/kernels/opencl/conv_image_compute.cc | 93 ++- 10 files changed, 1260 insertions(+), 525 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl index 1574cb4a69..23762ffd9f 100644 --- a/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/concat_kernel.cl @@ -14,47 +14,62 @@ limitations under the License. */ #include -__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data, - int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) { - const int index = get_global_id(0); - if (index < size){ - for (int i = 0; i < pre_size; i++){ - int offset_out = index * post_size + i * total; - int offset_in = index * post_size + i * total0; - // memcpy(out_data + offset_out, x_data0 + offset_in, post_size); - CL_DTYPE* dst = out_data + offset_out; - CL_DTYPE* src = x_data0 + offset_in; - for (int k = 0; k < post_size; k++){ - *dst++ = *src++; - } - } - }else if (index < axis_size){ - for (int i = 0; i < pre_size; i++){ - int offset_out = index * post_size + i * total; - int offset_in = index * post_size + i * total1; - // memcpy(out_data + offset_out, x_data1 + offset_in, post_size); - CL_DTYPE* dst = out_data + offset_out; - CL_DTYPE* src = x_data1 + offset_in; - for (int k = 0; k < post_size; k++){ - *dst++ = *src++; +__kernel void concat2(__global const CL_DTYPE* x_data0, + __global const CL_DTYPE* x_data1, + __global CL_DTYPE* out_data, + int size, + int axis_size, + int pre_size, + int post_size, + int total, + int total0, + int total1) { + const int index = get_global_id(0); + if (index < size) { + for (int i = 0; i < pre_size; i++) { + int offset_out = index * post_size + i * total; + int offset_in = index * post_size + i * total0; + // memcpy(out_data + offset_out, x_data0 + offset_in, post_size); + __global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out); + __global CL_DTYPE* src = (__global CL_DTYPE*)(x_data0 + offset_in); + for (int k = 0; k < post_size; k++) { + *dst++ = *src++; + } + } + } else if (index < axis_size) { + for (int i = 0; i < pre_size; i++) { + int offset_out = index * post_size + i * total; + int offset_in = index * post_size + i * total1; + // memcpy(out_data + offset_out, x_data1 + offset_in, post_size); + __global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out); + __global CL_DTYPE* src = (__global CL_DTYPE*)(x_data1 + offset_in); + for (int k = 0; k < post_size; k++) { + *dst++ = *src++; + } } } - } } -__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data, - int axis_size, int pre_size, int post_size, int start, int total, int total0) { - const int index = get_global_id(0); - if (index < axis_size){ - for (int i = 0; i < pre_size; i++){ - int offset_out = (start + index) * post_size + i * total; - int offset_in = index * post_size + i * total0; - // memcpy(out_data + offset_out, x_data + offset_in, post_size); - CL_DTYPE* dst = out_data + offset_out; - CL_DTYPE* src = x_data + offset_in; - for (int k = 0; k < post_size; k++){ - *dst++ = *src++; +__kernel void concat_mul_buffer( + __global const CL_DTYPE* x_data, + __global CL_DTYPE* out_data, + int axis_size, + int pre_size, + int post_size, + int start, + int total, + int total0) { + const int index = get_global_id(0); // [0, axis_size) + if (index < axis_size) { + for (int i = 0; i < pre_size; i++) { + int offset_out = (start + index) * post_size + i * total; + int offset_in = index * post_size + i * total0; + // memcpy(out_data + offset_out, x_data + offset_in, post_size); + __global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out); + __global CL_DTYPE* src = (__global CL_DTYPE*)(x_data + offset_in); + for (int k = 0; k < post_size; k++) { + *dst++ = *src++; + } } } - } } diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl index f7387c2026..e9cb92a615 100644 --- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -77,7 +77,7 @@ __kernel void hard_sigmoid(__read_only image2d_t input, CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = clamp(in * scale + value_offset, 0.0, 1.0); + CL_DTYPE4 out = clamp(in * (CL_DTYPE4)(scale) + (CL_DTYPE4)(value_offset), (CL_DTYPE4)(0.0), (CL_DTYPE4)(1.0)); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl index 40cc52d54d..d61bef7cbb 100644 --- a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl @@ -11,6 +11,286 @@ limitations under the License. */ #include +// deprecated +__kernel void concatByCWith2Inputs( + __write_only image2d_t output_image, + __private const int output_tensor_c, + __private const int output_tensor_w, + __read_only image2d_t input0_image, + __private const int input0_tensor_c, + __read_only image2d_t input1_image, + __private const int input1_tensor_c) { + const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4) + const int out_w = get_global_id(1); // [0, output_tensor_w) + const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h) + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = out_c * output_tensor_w + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= output_tensor_c) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < input0_tensor_c) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos); + } else { + c_in = c - input0_tensor_c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos); + } + int value_offset = c_in % 4; + float value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data); +} + + +__kernel void concatByCWith3Inputs( + __write_only image2d_t output_image, + __private const int output_tensor_c, + __private const int output_tensor_w, + __read_only image2d_t input0_image, + __private const int input0_tensor_c, + __read_only image2d_t input1_image, + __private const int input1_tensor_c, + __read_only image2d_t input2_image, + __private const int input2_tensor_c) { + const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4) + const int out_w = get_global_id(1); // [0, output_tensor_w) + const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h) + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = out_c * output_tensor_w + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= output_tensor_c) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < input0_tensor_c) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos); + } else if (c < input0_tensor_c + input1_tensor_c) { + c_in = c - input0_tensor_c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos); + } else { + c_in = c - input0_tensor_c - input1_tensor_c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos); + } + int value_offset = c_in % 4; + float value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data); +} + + +__kernel void concatByCWith4Inputs( + __write_only image2d_t output_image, + __private const int output_tensor_c, + __private const int output_tensor_w, + __read_only image2d_t input0_image, + __private const int input0_tensor_c, + __read_only image2d_t input1_image, + __private const int input1_tensor_c, + __read_only image2d_t input2_image, + __private const int input2_tensor_c, + __read_only image2d_t input3_image, + __private const int input3_tensor_c) { + const int out_c = get_global_id(0); // [0, (output_tensor_c + 3) / 4) + const int out_w = get_global_id(1); // [0, output_tensor_w) + const int out_nh = get_global_id(2); // [0, output_tensor_n * output_tensor_h) + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = out_c * output_tensor_w + out_w; + output_pos.y = out_nh; + CL_DTYPE4 output_data; + + for (int i = 0; i < 4; i++) { + int c = out_c * 4 + i; + if (c >= output_tensor_c) { + break; + } + int c_in; + CL_DTYPE4 input_data; + if (c < input0_tensor_c) { + c_in = c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos); + } else if (c < input0_tensor_c + input1_tensor_c) { + c_in = c - input0_tensor_c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos); + } else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c) { + c_in = c - input0_tensor_c - input1_tensor_c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos); + }else if (c < input0_tensor_c + input1_tensor_c + input2_tensor_c + input3_tensor_c){ + c_in = c - input0_tensor_c - input1_tensor_c - input2_tensor_c; + int2 input_pos; + input_pos.x = (c_in / 4) * output_tensor_w + out_w; + input_pos.y = out_nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input3_image, sampler, input_pos); + } + int value_offset = c_in % 4; + float value; + if (value_offset == 0) { + value = input_data.x; + } else if (value_offset == 1) { + value = input_data.y; + } else if (value_offset == 2) { + value = input_data.z; + } else if (value_offset == 3) { + value = input_data.w; + } + if (i == 0) { + output_data.x = value; + } else if (i == 1) { + output_data.y = value; + } else if (i == 2) { + output_data.z = value; + } else if (i == 3) { + output_data.w = value; + } + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data); +} + + +// deprecated +__kernel void concatByH(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const int out_H_Start) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,input_pos); + + int2 output_pos; + output_pos.x = input_pos.x; + output_pos.y = out_H_Start + input_pos.y; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); + +} + + +// deprecated +__kernel void concatByW(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int in_W, + __private const int pre_Width, + __private const int out_Width) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,input_pos); + + int2 output_pos; + output_pos.x = input_pos.x + pre_Width + out_Width * in_c; + output_pos.y = input_pos.y; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} + + __kernel void concat2(__read_only image2d_t input0, __read_only image2d_t input1, __write_only image2d_t output, @@ -103,62 +383,3 @@ __kernel void concat2(__read_only image2d_t input0, WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input); } } - -__kernel void concat_mul(__read_only image2d_t input, - __write_only image2d_t output, - int flag, int C_0, int out_C, int out_W, int in_W, int width) { - const int in_w = get_global_id(0); // image_width cxw/4 - const int in_c = get_global_id(1); // image_width cxw/4 - const int in_nh = get_global_id(2); // image_height nxh - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - int2 input_pos; - int2 output_pos; - input_pos.x = in_c * in_W + in_w; - input_pos.y = in_nh; - CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); - if (flag == 1){ // by channel - CL_DTYPE4 output_data; - for (int i = 0; i < 4; i++) { - int c_out = C_0 + in_c * 4 + i; - if (c_out >= out_C) { - break; - } - int2 output_pos; - output_pos.x = (c_out / 4) * in_W + in_w; - output_pos.y = in_nh; - CL_DTYPE val; - if (i == 0) { - val = input_data.x; - } else if (i == 1) { - val = input_data.y; - } else if (i == 2) { - val = input_data.z; - } else if (i == 3) { - val = input_data.w; - } - if (c_out % 4 == 0){ - output_data.x = val; - }else if (c_out % 4 == 1){ - output_data.y = val; - }else if (c_out % 4 == 2){ - output_data.z = val; - }else if (c_out % 4 == 3){ - output_data.w = val; - } - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data); - } - }else if (flag == 2){ // by height, width == n - int2 output_pos; - output_pos.x = in_c * in_W + in_w; - output_pos.y = in_nh + C_0 * width; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data); - }else if (flag == 3){ // by width, width == C - int2 output_pos; - output_pos.y = in_nh; - output_pos.x = in_c * out_W + (in_w + C_0); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data); - } -} diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 81e1a4d756..01651e0e41 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -8,35 +8,35 @@ set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper # image kernel # ##################### # basic -add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(elementwise_sub_opencl OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(fusion_elementwise_add_activation_opencl +add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(fusion_elementwise_add_activation_opencl_image OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc - DEPS elementwise_add_opencl ${cl_kernel_deps}) -add_kernel(fusion_elementwise_sub_activation_opencl + DEPS elementwise_add_opencl_image ${cl_kernel_deps}) +add_kernel(fusion_elementwise_sub_activation_opencl_image OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc - DEPS elementwise_sub_opencl ${cl_kernel_deps}) - -add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps}) + DEPS elementwise_sub_opencl_image ${cl_kernel_deps}) + +add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(reshape_opencl_image OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(transpose_opencl_image OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(conv_opencl_image OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(layout_opencl_image OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(concat_opencl_image OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(nearest_interp_opencl_image OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(scale_opencl_image OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(grid_sampler_opencl_image OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(lrn_opencl_image OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(bilinear_interp_opencl_image OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(slice_opencl_image OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(instance_norm_opencl_image OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(dropout_opencl_image OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps}) # extra # wait to add ... @@ -48,89 +48,89 @@ add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_ker # image kernel test # ###################### lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc - DEPS activation_opencl layout_opencl op_registry program context) + DEPS activation_opencl_image layout_opencl_image op_registry program context) lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc - DEPS conv_opencl op_registry program context) + DEPS conv_opencl_image op_registry program context) lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc - DEPS conv_opencl op_registry program context) + DEPS conv_opencl_image op_registry program context) lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc - DEPS nearest_interp_opencl layout_opencl op_registry program context) + DEPS nearest_interp_opencl_image layout_opencl_image op_registry program context) lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc - DEPS pool_opencl op_registry program context) + DEPS pool_opencl_image op_registry program context) lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc - DEPS scale_opencl op_registry program context) + DEPS scale_opencl_image op_registry program context) lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc - DEPS reshape_opencl op_registry program context) + DEPS reshape_opencl_image op_registry program context) lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc - DEPS transpose_opencl layout_opencl op_registry program context) + DEPS transpose_opencl_image layout_opencl_image op_registry program context) lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc - DEPS concat_opencl layout_opencl op_registry program context) + DEPS concat_opencl_image layout_opencl_image op_registry program context) #lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc -# DEPS elementwise_mul_opencl op_registry program context) +# DEPS elementwise_mul_opencl_image op_registry program context) lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc - DEPS layout_opencl op_registry program context) + DEPS layout_opencl_image op_registry program context) lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc - DEPS pixel_shuffle_opencl op_registry program context) + DEPS pixel_shuffle_opencl_image op_registry program context) lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc - DEPS expand_opencl op_registry program context) + DEPS expand_opencl_image op_registry program context) lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc - DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context) + DEPS elementwise_add_opencl_image fusion_elementwise_add_activation_opencl_image op_registry program context) lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc - DEPS elementwise_sub_opencl fusion_elementwise_sub_activation_opencl op_registry program context) + DEPS elementwise_sub_opencl_image fusion_elementwise_sub_activation_opencl_image op_registry program context) lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc - DEPS grid_sampler_opencl op_registry program context) + DEPS grid_sampler_opencl_image op_registry program context) lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc - DEPS lrn_opencl op_registry program context) + DEPS lrn_opencl_image op_registry program context) lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc - DEPS bilinear_interp_opencl op_registry program context) + DEPS bilinear_interp_opencl_image op_registry program context) lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc - DEPS slice_opencl op_registry program context) + DEPS slice_opencl_image op_registry program context) - #lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc - # DEPS instance_norm_opencl op_registry program context) +#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc +# DEPS instance_norm_opencl_image op_registry program context) lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc - DEPS dropout_opencl op_registry program context) + DEPS dropout_opencl_image op_registry program context) lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc - DEPS pad2d_opencl layout_opencl op_registry program context) + DEPS pad2d_opencl_image layout_opencl_image op_registry program context) lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc - DEPS box_coder_opencl op_registry program context) + DEPS box_coder_opencl_image op_registry program context) ###################### # buffer kernel # ###################### # basic -#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps}) -#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(activation_opencl_buffer OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(conv_opencl_buffer OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps}) #add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps}) -#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps}) -#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(pool_opencl_buffer OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(concat_opencl_buffer OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(fc_opencl_buffer OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps}) # NOTE(ysh329): use fc as `mul`, and mul is not used. -#add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps}) -#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps}) -#add_kernel(fusion_elementwise_add_activation_opencl +#add_kernel(mul_opencl_buffer OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(elementwise_add_opencl_buffer OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(fusion_elementwise_add_activation_opencl_buffer # OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc # DEPS elementwise_add_opencl ${cl_kernel_deps}) -add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) +add_kernel(io_copy_opencl_buffer OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) # extra # wait to add ... @@ -156,10 +156,10 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten # DEPS pool_opencl op_registry program context) #lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc -# DEPS concat_opencl op_registry program context) +# DEPS concat_opencl_buffer op_registry program context) lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc - DEPS fc_opencl op_registry program context) + DEPS fc_opencl_buffer op_registry program context) #lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc # DEPS mul_opencl op_registry program context) @@ -168,4 +168,4 @@ lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc # DEPS elementwise_add_opencl op_registry program context) lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc - DEPS io_copy_opencl op_registry program context) + DEPS io_copy_opencl_buffer op_registry program context) diff --git a/lite/kernels/opencl/activation_image_compute.cc b/lite/kernels/opencl/activation_image_compute.cc index 92ace84f85..5a157122c7 100644 --- a/lite/kernels/opencl/activation_image_compute.cc +++ b/lite/kernels/opencl/activation_image_compute.cc @@ -77,7 +77,7 @@ class ActivationComputeImageDefault scale_ = act_param_->hard_sigmoid_slope; threshold_ = act_param_->hard_sigmoid_offset; break; - defauln: + default: LOG(FATAL) << "This act type:" << act_type << " doesn't support."; return; } diff --git a/lite/kernels/opencl/concat_buffer_compute.cc b/lite/kernels/opencl/concat_buffer_compute.cc index c9d7fc1cb8..23e6408492 100644 --- a/lite/kernels/opencl/concat_buffer_compute.cc +++ b/lite/kernels/opencl/concat_buffer_compute.cc @@ -40,7 +40,7 @@ class ConcatCompute : public KernelLitex.size() == 2) { kernel_func_name_ = "concat2"; } else { - kernel_func_name_ = "concat_mul"; + kernel_func_name_ = "concat_mul_buffer"; } context.cl_context()->AddKernel(kernel_func_name_, "buffer/concat_kernel.cl", @@ -86,7 +86,6 @@ class ConcatCompute : public KernelLite(); const auto& x_dims = param.output->dims(); - auto image_shape = InitImageDimInfoWith(x_dims); auto* out_buf = param.output->mutable_data(TARGET(kOpenCL)); const auto& y_dims = param.output->dims(); // useless: check dim only @@ -98,8 +97,9 @@ class ConcatCompute : public KernelLite(axis_size_)}; int total = axis_size_ * post_size_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); if (inputs.size() == 2) { auto* x_buf0 = inputs[0]->data(); @@ -144,6 +144,15 @@ class ConcatCompute : public KernelLitedata(); global_work_size = cl::NDRange{static_cast(size)}; int total0 = size * post_size_; +#ifdef LITE_WITH_LOG + LOG(INFO) << "------------- i=" << i << " -------------"; + LOG(INFO) << "pre_size:" << pre_size_; + LOG(INFO) << "post_size:" << post_size_; + LOG(INFO) << "size:" << size; + LOG(INFO) << "start:" << start; + LOG(INFO) << "total:" << total; + LOG(INFO) << "total0:" << total0; +#endif cl_int status = kernel.setArg(arg_idx, *x_buf); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *out_buf); diff --git a/lite/kernels/opencl/concat_buffer_compute_test.cc b/lite/kernels/opencl/concat_buffer_compute_test.cc index 57621d4a39..1a3047e5bd 100644 --- a/lite/kernels/opencl/concat_buffer_compute_test.cc +++ b/lite/kernels/opencl/concat_buffer_compute_test.cc @@ -99,13 +99,14 @@ TEST(opencl_concat_buffer, compute) { auto *mapped_x2 = static_cast( TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production())); for (int i = 0; i < x0_dim.production(); i++) { - mapped_x0[i] = dist(engine); + mapped_x0[i] = i + 1; // dist(engine); } for (int i = 0; i < x1_dim.production(); i++) { - mapped_x1[i] = dist(engine); + mapped_x1[i] = x0_dim.production() + i + 1; // dist(engine); } for (int i = 0; i < x2_dim.production(); i++) { - mapped_x2[i] = dist(engine); + mapped_x2[i] = + x0_dim.production() + x1_dim.production() + i + 1; // dist(engine); } // set param and kernel, then run @@ -151,9 +152,13 @@ TEST(opencl_concat_buffer, compute) { auto *out_data = out.mutable_data(); auto *mapped_out = static_cast( TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production())); +#ifdef PRINT_RESULT_CONCAT_BUFFER for (int i = 0; i < out_dim.production(); i++) { - EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6); + LOG(INFO) << "i:" << i << ", out[" << i << "]:" << mapped_out[i] + << ", out_ref_data[" << i << "]:" << out_ref_data[i]; } +#endif + EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6); TargetWrapperCL::Unmap(out_data, mapped_out); TargetWrapperCL::Unmap(x0_data, mapped_x0); TargetWrapperCL::Unmap(x1_data, mapped_x1); diff --git a/lite/kernels/opencl/concat_image_compute.cc b/lite/kernels/opencl/concat_image_compute.cc index 25830b6a08..1a7ef0828f 100644 --- a/lite/kernels/opencl/concat_image_compute.cc +++ b/lite/kernels/opencl/concat_image_compute.cc @@ -38,72 +38,89 @@ class ConcatComputeImage : public KernelLiteAs(); concat_param_ = param_.get_mutable(); - if (concat_param_->x.size() == 2) { + + auto inputs = concat_param_->x; + auto axis_ = concat_param_->axis; + auto output_tensor_dims = concat_param_->output->dims(); + auto* axis_tensor = concat_param_->axis_tensor; + if (axis_tensor != nullptr) { + // auto* axis_tensor_data = axis_tensor->data(TARGET(kARM)); + // axis = axis_tensor_data[0]; + } + + if (inputs.size() == 2) { kernel_func_name_ = "concat2"; + } else if (inputs.size() == 3) { + kernel_func_name_ = "concatByCWith3Inputs"; + } else if (inputs.size() == 4) { + kernel_func_name_ = "concatByCWith4Inputs"; } else { - kernel_func_name_ = "concat_mul"; + // note: do layout transform between image and buffer, + // before and after concat(buffer impl.) + kernel_func_name_ = "concat_mul_buffer"; // buffer/concat_kernel.cl + build_options_ = " -DCL_DTYPE_float"; + auto in_dims = inputs[0]->dims(); + for (int i = 0; i < axis_; i++) { + pre_size_ *= in_dims[i]; + } + for (int i = axis_ + 1; i < in_dims.size(); i++) { + post_size_ *= in_dims[i]; + } } VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel(kernel_func_name_, - "image/concat_kernel.cl", + (kernel_func_name_ == "concat_mul_buffer") + ? "buffer/concat_kernel.cl" + : "image/concat_kernel.cl", build_options_, time_stamp_); - auto axis = concat_param_->axis; - auto inputs = concat_param_->x; - auto out_dims = concat_param_->output->dims(); - auto* axis_tensor = concat_param_->axis_tensor; - if (axis_tensor != nullptr) { - // auto* axis_tensor_data = axis_tensor->data(TARGET(kARM)); - // axis = axis_tensor_data[0]; - } - auto in_dims = inputs[0]->dims(); - axis_size_ = out_dims[axis]; - axis_ = axis; - if (out_dims.size() < 4) { - if (out_dims.size() - axis == 1) { + if (output_tensor_dims.size() < 4) { + if (output_tensor_dims.size() - axis_ == 1) { // width - width_ = out_dims[1]; // c + width_ = output_tensor_dims[1]; // c flag_ = 3; } else { // height - width_ = out_dims[0]; // n + width_ = output_tensor_dims[0]; // n flag_ = 2; } } else { switch (axis_) { case 0: - width_ = out_dims[2]; // h + width_ = output_tensor_dims[2]; // h flag_ = 0; break; - case 1: // channel - width_ = out_dims[3]; // w + case 1: // channel + width_ = output_tensor_dims[3]; // w flag_ = 1; break; - case 2: // height - width_ = out_dims[0]; // n + case 2: // height + width_ = output_tensor_dims[0]; // n flag_ = 2; break; case 3: - case -1: // width - width_ = out_dims[1]; // c + case -1: // width + width_ = output_tensor_dims[1]; // c flag_ = 3; break; default: - printf("this axis: %d does not support \n", axis_); + LOG(FATAL) << "Unsupported axis:" << axis_; } } + auto input0_tensor_dims = inputs[0]->dims(); for (int i = 1; i < inputs.size(); i++) { auto dims = inputs[i]->dims(); - // auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size()); - if (in_dims.size() != dims.size()) { + // auto flag = CHECK_EQ_OR_FALSE(input0_tensor_dims.size(), dims.size()); + if (input0_tensor_dims.size() != dims.size()) { printf("input shape must be same \n"); return; } for (int i = 0; i < dims.size(); i++) { - if (i != axis) { - if (in_dims[i] != dims[i]) { + if (i != axis_) { + if (input0_tensor_dims[i] != dims[i]) { printf("input shape must be same \n"); return; } @@ -113,21 +130,22 @@ class ConcatComputeImage : public KernelLite(); - const auto& x_dims = param.output->dims(); - auto image_shape = InitImageDimInfoWith(x_dims); - auto* out_buf = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - const auto& y_dims = param.output->dims(); // useless: check dim only - - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - STL::stringstream kernel_key; - kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + const auto& output_tensor_dims = concat_param_->output->dims(); + int output_tensor_w = output_tensor_dims[output_tensor_dims.size() - 1]; + int output_tensor_c = output_tensor_dims[1]; + auto output_image_shape = InitImageDimInfoWith(output_tensor_dims); + auto* output_image_p = + concat_param_->output->mutable_data( + output_image_shape["width"], output_image_shape["height"]); + auto inputs = concat_param_->x; - auto inputs = param.x; - int arg_idx = 0; - int width = inputs[0]->dims()[inputs[0]->dims().size() - 1]; + auto global_work_size = + cl::NDRange{static_cast( + output_tensor_dims[output_tensor_dims.size() - 1]), + static_cast( + output_image_shape["width"] / + output_tensor_dims[output_tensor_dims.size() - 1]), + static_cast(output_image_shape["height"])}; #ifdef LITE_WITH_LOG VLOG(4) << "concat input shape: "; @@ -141,55 +159,54 @@ class ConcatComputeImage : public KernelLite(x_dims[x_dims.size() - 1]), - static_cast(image_shape["width"] / - x_dims[x_dims.size() - 1]), - static_cast(image_shape["height"])}; - -#ifdef LITE_WITH_LOG - VLOG(4) << TargetToStr(param.output->target()); - VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " - << image_shape["height"]; - VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " - << x_dims[1] << " " << x_dims[2] << " " << x_dims[3] - << "x_dims[x_dims.size() - 1]" << x_dims[x_dims.size() - 1]; - VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " - << y_dims[1] << " " << y_dims[2] << " " << y_dims[3]; - VLOG(4) << "width_: " << width_ << ", flag_: " << flag_; - VLOG(4) << "global_work_size: " << x_dims[x_dims.size() - 1] << " " - << (image_shape["width"] / x_dims[x_dims.size() - 1]) << " " - << (image_shape["height"]); + VLOG(4) << TargetToStr(concat_param_->output->target()); + VLOG(4) << "output_image_shape(w,h):" << output_image_shape["width"] << " " + << output_image_shape["height"]; + VLOG(4) << "output_tensor_dims[" << output_tensor_dims.size() + << "D]:" << output_tensor_dims[0] << " " << output_tensor_dims[1] + << " " << output_tensor_dims[2] << " " << output_tensor_dims[3] + << "output_tensor_dims[output_tensor_dims.size() - 1]" + << output_tensor_dims[output_tensor_dims.size() - 1]; + VLOG(4) << "output_tensor_w: " << output_tensor_w << ", flag_: " << flag_; + VLOG(4) << "width_:" << width_; + VLOG(4) << "global_work_size: " + << output_tensor_dims[output_tensor_dims.size() - 1] << " " + << (output_image_shape["width"] / + output_tensor_dims[output_tensor_dims.size() - 1]) + << " " << (output_image_shape["height"]); #endif + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - int out_w = x_dims[x_dims.size() - 1]; - int out_c = x_dims[1]; - if (inputs.size() == 2) { - auto* x_buf0 = inputs[0]->data(); - auto* x_buf1 = inputs[1]->data(); - cl_int status = kernel.setArg(arg_idx, *x_buf0); + + if (kernel_func_name_ == "concat2") { + auto* input0_image_p = inputs[0]->data(); + auto* input1_image_p = inputs[1]->data(); + int input0_axis_dims = inputs[0]->dims()[axis_]; + cl_int status = kernel.setArg(0, *input0_image_p); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *x_buf1); + status = kernel.setArg(1, *input1_image_p); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_buf); + status = kernel.setArg(2, *output_image_p); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, flag_); + status = kernel.setArg(3, flag_); CL_CHECK_FATAL(status); - status = - kernel.setArg(++arg_idx, static_cast(inputs[0]->dims()[axis_])); + status = kernel.setArg(4, input0_axis_dims); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_c); + status = kernel.setArg(5, output_tensor_c); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_w); + status = kernel.setArg(6, output_tensor_w); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width_); + status = kernel.setArg(7, width_); CL_CHECK_FATAL(status); status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( @@ -200,51 +217,209 @@ class ConcatComputeImage : public KernelLitedims(); - image_shape = InitImageDimInfoWith(in_dims); - auto* x_buf = inputs[i]->data(); - int in_w = in_dims[in_dims.size() - 1]; + } else if (kernel_func_name_ == "concatByCWith3Inputs" || + kernel_func_name_ == "concatByCWith4Inputs") { + auto* input0 = inputs[0]; + auto* input0_image_p = input0->data(); + int input0_tensor_c = input0->dims()[1]; + + auto* input1 = inputs.size() >= 2 ? inputs[1] : nullptr; + auto* input1_image_p = + input1 ? input1->data() : nullptr; + int input1_tensor_c = input1 ? input1->dims()[1] : -1; + + auto* input2 = inputs.size() >= 3 ? inputs[2] : nullptr; + auto* input2_image_p = + input2 ? input2->data() : nullptr; + int input2_tensor_c = input2 ? input2->dims()[1] : -1; + + auto* input3 = inputs.size() >= 4 ? inputs[3] : nullptr; + auto* input3_image_p = + input3 ? input3->data() : nullptr; + int input3_tensor_c = input3 ? input3->dims()[1] : -1; + + int output_tensor_c = output_tensor_dims[1]; + int output_tensor_w = output_tensor_dims[3]; + + const std::vector& default_work_size = DefaultWorkSize( + output_tensor_dims, + DDim(std::vector{ + static_cast(output_image_shape["width"]), + static_cast(output_image_shape["height"])})); + cl::NDRange global_work_size = + cl::NDRange{static_cast(default_work_size[0]), + static_cast(default_work_size[1]), + static_cast(default_work_size[2])}; + + cl_int status; + status = kernel.setArg(0, *output_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(1, output_tensor_c); + CL_CHECK_FATAL(status); + status = kernel.setArg(2, output_tensor_w); + CL_CHECK_FATAL(status); + status = kernel.setArg(3, *input0_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(4, input0_tensor_c); + CL_CHECK_FATAL(status); + status = kernel.setArg(5, *input1_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(6, input1_tensor_c); + CL_CHECK_FATAL(status); + if (inputs.size() >= 3) { + status = kernel.setArg(7, *input2_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(8, input2_tensor_c); + CL_CHECK_FATAL(status); + } + if (inputs.size() == 4) { + status = kernel.setArg(9, *input3_image_p); + CL_CHECK_FATAL(status); + status = kernel.setArg(10, input3_tensor_c); + CL_CHECK_FATAL(status); + } + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } else if (kernel_func_name_ == "concat_mul_buffer") { // inputs.size() > 4 + // note: do image layout transform: image to buffer + size_t inputs_num = inputs.size(); + std::vector inputs_image_pointers(inputs_num); + std::vector> inputs_image_shapes( + inputs_num); + std::vector inputs_dims(inputs_num); + std::vector inputs_buffer_pointers(inputs_num); + for (int i = 0; i < inputs_num; i++) { + auto* input = inputs[i]; + inputs_dims[i] = input->dims(); + inputs_image_shapes[i] = InitImageDimInfoWith(input->dims()); + inputs_image_pointers[i] = input->data(); + } + // step1. create kernels + // 1.1 img_to_buf + std::vector>> + img_to_buf_kernels_vec(inputs_num); + for (size_t i = 0; i < inputs_num; ++i) { + auto img_to_buf_kernels = KernelRegistry::Global().Create( + "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); + img_to_buf_kernels_vec[i] = std::move(img_to_buf_kernels); + } + // 1.2 buf_to_img + std::list> buf_to_img_kernels = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + + // step2. get real kernel + // 2.1 img_to_buf + std::vector> img_to_buf_kernel_vec( + inputs_num); + for (size_t i = 0; i < inputs_num; ++i) { + img_to_buf_kernel_vec[i] = std::move(img_to_buf_kernels_vec[i].front()); + } + // 2.2 buf_to_img + std::unique_ptr buf_to_img_kernel = + std::move(buf_to_img_kernels.front()); + + // step3. create and set param, context to kernel + std::unique_ptr kernel_context(new KernelContext); + kernel_context->As().InitOnce(); + // 3.1 img_to_buf + std::vector img_to_buf_params(inputs_num); + std::vector outputs_vec(inputs_num); + std::vector outputs_buffer_pointers(inputs_num); + for (size_t i = 0; i < inputs_num; ++i) { + img_to_buf_params[i].x = inputs[i]; + img_to_buf_params[i].y = &outputs_vec[i]; + outputs_vec[i].Resize(inputs_dims[i]); + outputs_buffer_pointers[i] = + outputs_vec[i].mutable_data(TARGET(kOpenCL)); + img_to_buf_kernel_vec[i]->SetParam(img_to_buf_params[i]); + + std::unique_ptr img_to_buf_context(new KernelContext); + kernel_context->As().CopySharedTo( + &(img_to_buf_context->As())); + img_to_buf_kernel_vec[i]->SetContext(std::move(img_to_buf_context)); + } + // 3.2 concat_mul_buf + std::shared_ptr concat_mul_buf_output_t(new lite::Tensor); + concat_mul_buf_output_t->Resize(concat_param_->output->dims()); + auto conat_mul_buf_output_data = + concat_mul_buf_output_t->mutable_data( + TARGET(kOpenCL)); + // 3.3 buf_to_img + std::shared_ptr buf_to_img_output_t(new lite::Tensor); + buf_to_img_output_t->Resize(concat_param_->output->dims()); + + std::shared_ptr buf_to_img_param( + new operators::LayoutParam); + buf_to_img_param->x = concat_mul_buf_output_t.get(); + buf_to_img_param->y = concat_param_->output; + buf_to_img_kernel->SetParam(buf_to_img_param); + + std::unique_ptr buf_to_img_context(new KernelContext); + kernel_context->As().CopySharedTo( + &(buf_to_img_context->As())); + buf_to_img_kernel->SetContext(std::move(buf_to_img_context)); + + // step4. run kernels + // 4.1 run kernel: image->buffer + for (size_t i = 0; i < inputs_num; ++i) { + img_to_buf_kernel_vec[i]->Launch(); + } + // 4.2 run kernel: concat_mul_buffer + int cur_axis_start_idx = 0; + int total = output_tensor_dims[axis_] * post_size_; + for (size_t i = 0; i < inputs_num; ++i) { + auto* x_buf = outputs_buffer_pointers[i]; + int axis_dim_size = inputs[i]->dims()[axis_]; + global_work_size = cl::NDRange{static_cast(axis_dim_size)}; + int total0 = axis_dim_size * post_size_; #ifdef LITE_WITH_LOG - VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " - << image_shape["height"]; + VLOG(2) << "--------------- i:" << i << " -----------------"; + VLOG(2) << "post_size_:" << post_size_; + VLOG(2) << "pre_size_:" << pre_size_; + VLOG(2) << "axis_dim_size:" << axis_dim_size; + VLOG(2) << "cur_axis_start_idx:" << cur_axis_start_idx; + VLOG(2) << "total:" << total; + VLOG(2) << "total0:" << total0; #endif - global_work_size = - cl::NDRange{static_cast(in_dims[in_dims.size() - 1]), - static_cast(image_shape["width"] / - in_dims[in_dims.size() - 1]), - static_cast(image_shape["height"])}; - cl_int status = kernel.setArg(arg_idx, *x_buf); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *out_buf); + cl_int status; + status = kernel.setArg(0, *x_buf); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, flag_); + status = kernel.setArg(1, *conat_mul_buf_output_data); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, start); + status = kernel.setArg(2, axis_dim_size); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_c); + status = kernel.setArg(3, pre_size_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, out_w); + status = kernel.setArg(4, post_size_); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, in_w); + status = kernel.setArg(5, cur_axis_start_idx); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, width_); + status = kernel.setArg(6, total); CL_CHECK_FATAL(status); + status = kernel.setArg(7, total0); CL_CHECK_FATAL(status); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); - start += inputs[i]->dims()[axis_]; + cur_axis_start_idx += axis_dim_size; } + // 4.3 run kernel: buffer->image + buf_to_img_kernel->Launch(); } } @@ -258,10 +433,11 @@ class ConcatComputeImage : public KernelLite ins_data, } } -// #define LOOP_TEST -// #define PRINT_RESULT -TEST(concat_image2d, compute) { +// #define LOOP_TEST_CONCAT2 +// #define PRINT_RESULT_CONCAT2 +#define CHECK_RESULT_CONCAT2 +TEST(concat_image2d_concat2, compute) { LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> " "layout(img2buf) " "-> host"; - -#ifdef LOOP_TEST - for (int n = 1; n <= 100; n += 33) { - for (auto c : {1, 3}) { - for (int h = 12; h <= 100; h += 13) { - for (int w = 12; w <= 100; w += 25) { - for (atuo &axis : {0, 1, 2, 3}) { +#ifdef LOOP_TEST_CONCAT2 + const int axis = 1; + const int n = 1; + for (int c = 1; c < 4; ++c) { + for (int h = 1; h < 50; ++h) { + for (int w = 1; w < 50; ++w) { #else + const int axis = 1; const int n = 1; - const int c = 2; - const int h = 3; - const int w = 4; + const int c = 3; // 1; + const int h = 15; // 2; + const int w = 15; // 42; +#endif // LOOP_TEST_CONCAT2 + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " + << h << " " << w << " ========"; + LOG(INFO) << "======== axis: " << axis; + + // set layout kernels + std::list> buf_to_img_kernels = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + auto buf_to_img_kernels1 = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + auto img_to_buf_kernels = KernelRegistry::Global().Create( + "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); + auto concat_img_kernels = + KernelRegistry::Global().Create("concat", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(buf_to_img_kernels.empty()); + ASSERT_FALSE(buf_to_img_kernels1.empty()); + ASSERT_FALSE(img_to_buf_kernels.empty()); + ASSERT_FALSE(concat_img_kernels.empty()); + + auto buf_to_img_kernel = std::move(buf_to_img_kernels.front()); + auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front()); + auto img_to_buf_kernel = std::move(img_to_buf_kernels.front()); + auto concat_img_kernel = std::move(concat_img_kernels.front()); + LOG(INFO) << "get 1st-0 kernel: " << buf_to_img_kernel->doc(); + LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc(); + LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc(); + LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc(); + + // set tensors about op param + LOG(INFO) << "set tensors about op param"; + lite::Tensor x0, x1, y, concat_in0, concat_in1, concat_out, y_ref; + operators::LayoutParam BufferToImageParam0, BufferToImageParam1, + BufferToImageParam2; + operators::LayoutParam ImageToBufferParam; + BufferToImageParam0.x = &x0; + BufferToImageParam0.y = &concat_in0; + BufferToImageParam1.x = &x1; + BufferToImageParam1.y = &concat_in1; + ImageToBufferParam.x = &concat_out; + ImageToBufferParam.y = &y; + std::vector ins; + operators::ConcatParam concatParam; + ins.push_back(&concat_in0); + ins.push_back(&concat_in1); + concatParam.x = ins; + concatParam.axis = axis; + concatParam.output = &concat_out; + + DDim x0_dim = DDim(std::vector{n, c, h, w}); + DDim x1_dim = DDim(std::vector{n, c, h, w}); + DDim out_dim = DDim(std::vector{n, c, h, w}); + // note: used to make cases with different channel + // x1_dim[axis] += 2; + out_dim[axis] = x0_dim[axis] + x1_dim[axis]; + x0.Resize(x0_dim); + x1.Resize(x1_dim); + y.Resize(out_dim); + concat_in0.Resize(x0_dim); + concat_in1.Resize(x1_dim); + concat_out.Resize(out_dim); + y_ref.Resize(out_dim); + auto concat_image2d_shape = + paddle::lite::kernels::opencl::InitImageDimInfoWith(out_dim); + auto concat_image2d_shape_in0 = + paddle::lite::kernels::opencl::InitImageDimInfoWith(x0_dim); + auto concat_image2d_shape_in1 = + paddle::lite::kernels::opencl::InitImageDimInfoWith(x1_dim); + + // initialize tensors + LOG(INFO) << "initialize tensors"; + auto *x_data0 = x0.mutable_data(TARGET(kOpenCL)); + auto *x_data1 = x1.mutable_data(TARGET(kOpenCL)); + auto *y_data = y.mutable_data(TARGET(kOpenCL)); + auto *y_data_ref = y_ref.mutable_data(TARGET(kARM)); + auto *mapped_x0 = static_cast(TargetWrapperCL::Map( + x_data0, 0, sizeof(float) * x0_dim.production())); + auto *mapped_x1 = static_cast(TargetWrapperCL::Map( + x_data1, 0, sizeof(float) * x1_dim.production())); + auto *mapped_y = static_cast(TargetWrapperCL::Map( + y_data, 0, sizeof(float) * out_dim.production())); + for (int i = 0; i < x0_dim.production(); ++i) { + mapped_x0[i] = i + 1; // (i+1) * 0.01;//*/static_cast(i) - + // x0_dim.production() / 2 * 0.00987; + } + for (int i = 0; i < x1_dim.production(); ++i) { + mapped_x1[i] = x0_dim.production() + i + + 1; // (i+1) * 0.1;//*/static_cast(i) - + // x1_dim.production() / 2 * 0.00987; + } + for (int i = 0; i < out_dim.production(); ++i) { + mapped_y[i] = static_cast(0); + } + auto *concat_in_data0 = concat_in0.mutable_data( + concat_image2d_shape_in0["width"], + concat_image2d_shape_in0["height"]); + auto *concat_in_data1 = concat_in1.mutable_data( + concat_image2d_shape_in1["width"], + concat_image2d_shape_in1["height"]); + auto *concat_out_data = concat_out.mutable_data( + concat_image2d_shape["width"], concat_image2d_shape["height"]); + + // set context and kernel args + LOG(INFO) << "set context and kernel args"; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + buf_to_img_kernel->SetParam(BufferToImageParam0); + std::unique_ptr buf_to_img_context(new KernelContext); + context->As().CopySharedTo( + &(buf_to_img_context->As())); + buf_to_img_kernel->SetContext(std::move(buf_to_img_context)); + + buf_to_img_kernel1->SetParam(BufferToImageParam1); + std::unique_ptr buf_to_img_context1(new KernelContext); + context->As().CopySharedTo( + &(buf_to_img_context1->As())); + buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1)); + + img_to_buf_kernel->SetParam(ImageToBufferParam); + std::unique_ptr img_to_buf_context(new KernelContext); + context->As().CopySharedTo( + &(img_to_buf_context->As())); + img_to_buf_kernel->SetContext(std::move(img_to_buf_context)); + + concat_img_kernel->SetParam(concatParam); + std::unique_ptr concat_img_context(new KernelContext); + context->As().CopySharedTo( + &(concat_img_context->As())); + concat_img_kernel->SetContext(std::move(concat_img_context)); + + // run kernels + LOG(INFO) << "run kernel: buf_to_img_kernel"; + buf_to_img_kernel->Launch(); + buf_to_img_kernel1->Launch(); + LOG(INFO) << "run kernel: concat_img_kernel"; + concat_img_kernel->Launch(); + LOG(INFO) << "run kernel: img_to_buf_kernel"; + img_to_buf_kernel->Launch(); + + CLRuntime::Global()->command_queue().finish(); + + // compute ref cp_u + std::vector ins_ptr; + std::vector in_dim; + ins_ptr.push_back(mapped_x0); + ins_ptr.push_back(mapped_x1); + in_dim.push_back(x0_dim); + in_dim.push_back(x1_dim); + concat_mul_compute_ref( + ins_ptr, in_dim, axis, out_dim, y_data_ref); +// result +#ifdef PRINT_RESULT_CONCAT2 + LOG(INFO) << "---- print kernel result (input -> output) ----"; + for (int i = 0; i < out_dim.production(); ++i) { + auto abs_diff = abs(y_data_ref[i] - mapped_y[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]); + LOG(INFO) << "idx:" << i << " mapped_y[" << i << "]:" << mapped_y[i] + << " y_data_ref[" << i << "]:" << y_data_ref[i] + << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF; +#if 0 // note: used to write to file and check + std::cout << "idx:" << i << " mapped_y[" << i + << "]:" << mapped_y[i] << " y_data_ref[" << i + << "]:" << y_data_ref[i] << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF << "\n"; +#endif + } +#endif // PRINT_RESULT_CONCAT2 + +#ifdef CHECK_RESULT_CONCAT2 + // check result: compare kernel output and cpu output(y_data_ref) + for (int i = 0; i < out_dim.production(); i++) { + auto abs_diff = abs(y_data_ref[i] - mapped_y[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]); + EXPECT_EQ( + (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), + true); + if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) { + LOG(FATAL) << "error idx:" << i << " mapped_y[" << i + << "]:" << mapped_y[i] << " y_data_ref[" << i + << "]:" << y_data_ref[i] << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF; + break; + } + } +#endif + // free + LOG(INFO) << "free: unmap x, y"; + TargetWrapperCL::Unmap(x_data0, mapped_x0); + TargetWrapperCL::Unmap(x_data1, mapped_x1); + TargetWrapperCL::Unmap(y_data, mapped_y); +#ifdef LOOP_TEST_CONCAT2 + } // w + } // h + } // c +#else +// nothing to do. +#endif +} + +// #define LOOP_TEST_CONCAT_MUL +// #define PRINT_RESULT_CONCAT_MUL +#define CHECK_RESULT_CONCAT_MUL +TEST(concat_image2d_multi, compute) { + LOG(INFO) << "main steps of test: host -> layout(buf2img) -> concat(img) -> " + "layout(img2buf) " + "-> host"; +#ifdef LOOP_TEST_CONCAT_MUL const int axis = 1; -#endif // LOOP_TEST - LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c - << " " << h << " " << w << " ========"; - LOG(INFO) << "======== axis: " << axis; - // set layout kernels - auto buf_to_img_kernels = - KernelRegistry::Global().Create("layout", - TARGET(kOpenCL), - PRECISION(kAny), - DATALAYOUT(kImageDefault)); - auto buf_to_img_kernels1 = - KernelRegistry::Global().Create("layout", - TARGET(kOpenCL), - PRECISION(kAny), - DATALAYOUT(kImageDefault)); - auto img_to_buf_kernels = KernelRegistry::Global().Create( - "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); - auto concat_img_kernels = - KernelRegistry::Global().Create("concat", - TARGET(kOpenCL), - PRECISION(kFP16), - DATALAYOUT(kImageDefault)); - ASSERT_FALSE(buf_to_img_kernels.empty()); - ASSERT_FALSE(buf_to_img_kernels1.empty()); - ASSERT_FALSE(img_to_buf_kernels.empty()); - ASSERT_FALSE(concat_img_kernels.empty()); - - auto buf_to_img_kernel = std::move(buf_to_img_kernels.front()); - auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front()); - auto img_to_buf_kernel = std::move(img_to_buf_kernels.front()); - auto concat_img_kernel = std::move(concat_img_kernels.front()); - LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc(); - LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc(); - LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc(); - LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc(); - - // set tensors about op param - LOG(INFO) << "set tensors about op param"; - lite::Tensor x0, x1, y, concat_in0, concat_in1, concat_out, y_ref; - operators::LayoutParam BufferToImageParam0, BufferToImageParam1; - operators::LayoutParam ImageToBufferParam; - BufferToImageParam0.x = &x0; - BufferToImageParam0.y = &concat_in0; - BufferToImageParam1.x = &x1; - BufferToImageParam1.y = &concat_in1; - ImageToBufferParam.x = &concat_out; - ImageToBufferParam.y = &y; - std::vector ins; - operators::ConcatParam concatParam; - ins.push_back(&concat_in0); - ins.push_back(&concat_in1); - concatParam.x = ins; - concatParam.axis = axis; - concatParam.output = &concat_out; - - const DDim x0_dim = DDim(std::vector{n, c, h, w}); - DDim x1_dim = DDim(std::vector{n, c, h, w}); - DDim out_dim = DDim(std::vector{n, c, h, w}); - x1_dim[axis] += 2; - out_dim[axis] = x0_dim[axis] + x1_dim[axis]; - x0.Resize(x0_dim); - x1.Resize(x1_dim); - y.Resize(out_dim); - concat_in0.Resize(x0_dim); - concat_in1.Resize(x1_dim); - concat_out.Resize(out_dim); - y_ref.Resize(out_dim); - auto concat_image2d_shape = - paddle::lite::kernels::opencl::InitImageDimInfoWith(out_dim); - auto concat_image2d_shape_in0 = - paddle::lite::kernels::opencl::InitImageDimInfoWith(x0_dim); - auto concat_image2d_shape_in1 = - paddle::lite::kernels::opencl::InitImageDimInfoWith(x1_dim); - - // initialize tensors - LOG(INFO) << "initialize tensors"; - auto *x_data0 = x0.mutable_data(TARGET(kOpenCL)); - auto *x_data1 = x1.mutable_data(TARGET(kOpenCL)); - auto *y_data = y.mutable_data(TARGET(kOpenCL)); - auto *y_data_ref = y_ref.mutable_data(TARGET(kARM)); - auto *mapped_x0 = static_cast(TargetWrapperCL::Map( - x_data0, 0, sizeof(float) * x0_dim.production())); - auto *mapped_x1 = static_cast(TargetWrapperCL::Map( - x_data1, 0, sizeof(float) * x1_dim.production())); - auto *mapped_y = static_cast(TargetWrapperCL::Map( - y_data, 0, sizeof(float) * out_dim.production())); - for (int i = 0; i < x0_dim.production(); ++i) { - mapped_x0[i] = static_cast(i) - x0_dim.production() / 2; - } - for (int i = 0; i < x1_dim.production(); ++i) { - mapped_x1[i] = static_cast(i) - x1_dim.production() / 2; - } - for (int i = 0; i < out_dim.production(); ++i) { - mapped_y[i] = static_cast(0); - } - auto *concat_in_data0 = - concat_in0.mutable_data( - concat_image2d_shape_in0["width"], - concat_image2d_shape_in0["height"]); - auto *concat_in_data1 = - concat_in1.mutable_data( - concat_image2d_shape_in1["width"], - concat_image2d_shape_in1["height"]); - auto *concat_out_data = - concat_out.mutable_data( - concat_image2d_shape["width"], - concat_image2d_shape["height"]); - - // set context and kernel args - LOG(INFO) << "set context and kernel args"; - std::unique_ptr context(new KernelContext); - context->As().InitOnce(); - - buf_to_img_kernel->SetParam(BufferToImageParam0); - std::unique_ptr buf_to_img_context( - new KernelContext); - context->As().CopySharedTo( - &(buf_to_img_context->As())); - buf_to_img_kernel->SetContext(std::move(buf_to_img_context)); - buf_to_img_kernel1->SetParam(BufferToImageParam1); - std::unique_ptr buf_to_img_context1( - new KernelContext); - context->As().CopySharedTo( - &(buf_to_img_context1->As())); - buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1)); - - img_to_buf_kernel->SetParam(ImageToBufferParam); - std::unique_ptr img_to_buf_context( - new KernelContext); - context->As().CopySharedTo( - &(img_to_buf_context->As())); - img_to_buf_kernel->SetContext(std::move(img_to_buf_context)); - - concat_img_kernel->SetParam(concatParam); - std::unique_ptr concat_img_context( - new KernelContext); - context->As().CopySharedTo( - &(concat_img_context->As())); - concat_img_kernel->SetContext(std::move(concat_img_context)); - - // run kernels - LOG(INFO) << "run kernel: buf_to_img_kernel"; - buf_to_img_kernel->Launch(); - buf_to_img_kernel1->Launch(); - LOG(INFO) << "run kernel: concat_img_kernel"; - concat_img_kernel->Launch(); - LOG(INFO) << "run kernel: img_to_buf_kernel"; - img_to_buf_kernel->Launch(); - - CLRuntime::Global()->command_queue().finish(); - - // compute ref cp_u - std::vector ins_ptr; - std::vector in_dim; - ins_ptr.push_back(mapped_x0); - ins_ptr.push_back(mapped_x1); - in_dim.push_back(x0_dim); - in_dim.push_back(x1_dim); - concat_mul_compute_ref( - ins_ptr, in_dim, axis, out_dim, y_data_ref); + const int n = 1; + for (int c = 1; c < 4; ++c) { + for (int h = 1; h < 50; ++h) { + for (int w = 1; w < 50; ++w) { +#else + const int axis = 1; + const int n = 1; + const int c = 3; + const int h = 4; + const int w = 5; +#endif // LOOP_TEST_CONCAT_MUL + LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " + << h << " " << w << " ========"; + LOG(INFO) << "======== axis: " << axis; + + // set layout kernels + auto buf_to_img_kernels = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + auto buf_to_img_kernels1 = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + auto buf_to_img_kernels2 = + KernelRegistry::Global().Create("layout", + TARGET(kOpenCL), + PRECISION(kAny), + DATALAYOUT(kImageDefault)); + auto img_to_buf_kernels = KernelRegistry::Global().Create( + "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)); + auto concat_img_kernels = + KernelRegistry::Global().Create("concat", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(buf_to_img_kernels.empty()); + ASSERT_FALSE(buf_to_img_kernels1.empty()); + ASSERT_FALSE(buf_to_img_kernels2.empty()); + ASSERT_FALSE(img_to_buf_kernels.empty()); + ASSERT_FALSE(concat_img_kernels.empty()); + + auto buf_to_img_kernel = std::move(buf_to_img_kernels.front()); + auto buf_to_img_kernel1 = std::move(buf_to_img_kernels1.front()); + auto buf_to_img_kernel2 = std::move(buf_to_img_kernels2.front()); + auto img_to_buf_kernel = std::move(img_to_buf_kernels.front()); + auto concat_img_kernel = std::move(concat_img_kernels.front()); + LOG(INFO) << "get 1st-0 kernel: " << buf_to_img_kernel->doc(); + LOG(INFO) << "get 1st-1 kernel: " << buf_to_img_kernel1->doc(); + LOG(INFO) << "get 1st-2 kernel: " << buf_to_img_kernel2->doc(); + LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc(); + LOG(INFO) << "get 3rd kernel: " << concat_img_kernel->doc(); + + // set tensors about op param + LOG(INFO) << "set tensors about op param"; + lite::Tensor x0, x1, x2, y, concat_in0, concat_in1, concat_in2, + concat_out, y_ref; + operators::LayoutParam BufferToImageParam0, BufferToImageParam1, + BufferToImageParam2; + operators::LayoutParam ImageToBufferParam; + BufferToImageParam0.x = &x0; + BufferToImageParam0.y = &concat_in0; + BufferToImageParam1.x = &x1; + BufferToImageParam1.y = &concat_in1; + BufferToImageParam2.x = &x2; + BufferToImageParam2.y = &concat_in2; + ImageToBufferParam.x = &concat_out; + ImageToBufferParam.y = &y; + std::vector ins; + operators::ConcatParam concatParam; + ins.push_back(&concat_in0); + ins.push_back(&concat_in1); + ins.push_back(&concat_in2); + concatParam.x = ins; + concatParam.axis = axis; + concatParam.output = &concat_out; + + DDim x0_dim = DDim(std::vector{n, c, h, w}); + DDim x1_dim = DDim(std::vector{n, c, h, w}); + DDim x2_dim = DDim(std::vector{n, c, h, w}); + DDim out_dim = DDim(std::vector{n, c * 3, h, w}); + // x1_dim[axis] += 2; + out_dim[axis] = x0_dim[axis] + x1_dim[axis] + x2_dim[axis]; + x0.Resize(x0_dim); + x1.Resize(x1_dim); + x2.Resize(x2_dim); + y.Resize(out_dim); + concat_in0.Resize(x0_dim); + concat_in1.Resize(x1_dim); + concat_in2.Resize(x2_dim); + concat_out.Resize(out_dim); + y_ref.Resize(out_dim); + auto concat_image2d_shape = + paddle::lite::kernels::opencl::InitImageDimInfoWith(out_dim); + auto concat_image2d_shape_in0 = + paddle::lite::kernels::opencl::InitImageDimInfoWith(x0_dim); + auto concat_image2d_shape_in1 = + paddle::lite::kernels::opencl::InitImageDimInfoWith(x1_dim); + auto concat_image2d_shape_in2 = + paddle::lite::kernels::opencl::InitImageDimInfoWith(x2_dim); + + // initialize tensors + LOG(INFO) << "initialize tensors"; + auto *x_data0 = x0.mutable_data(TARGET(kOpenCL)); + auto *x_data1 = x1.mutable_data(TARGET(kOpenCL)); + auto *x_data2 = x2.mutable_data(TARGET(kOpenCL)); + auto *y_data = y.mutable_data(TARGET(kOpenCL)); + auto *y_data_ref = y_ref.mutable_data(TARGET(kARM)); + auto *mapped_x0 = static_cast(TargetWrapperCL::Map( + x_data0, 0, sizeof(float) * x0_dim.production())); + auto *mapped_x1 = static_cast(TargetWrapperCL::Map( + x_data1, 0, sizeof(float) * x1_dim.production())); + auto *mapped_x2 = static_cast(TargetWrapperCL::Map( + x_data2, 0, sizeof(float) * x2_dim.production())); + auto *mapped_y = static_cast(TargetWrapperCL::Map( + y_data, 0, sizeof(float) * out_dim.production())); + for (int i = 0; i < x0_dim.production(); ++i) { + mapped_x0[i] = i + 1; // (i+1) * 0.01;//*/static_cast(i) - + // x0_dim.production() / 2 * 0.00987; + } + for (int i = 0; i < x1_dim.production(); ++i) { + mapped_x1[i] = x0_dim.production() + i + + 1; // (i+1) * 0.1;//*/static_cast(i) - + // x1_dim.production() / 2 * 0.00987; + } + for (int i = 0; i < x2_dim.production(); ++i) { + mapped_x2[i] = x0_dim.production() + x1_dim.production() + i + + 1; // (i+1) * 1.;//*/ static_cast(i) - + // x2_dim.production() / 2 * 0.00987; + } + for (int i = 0; i < out_dim.production(); ++i) { + mapped_y[i] = static_cast(0); + } + auto *concat_in_data0 = concat_in0.mutable_data( + concat_image2d_shape_in0["width"], + concat_image2d_shape_in0["height"]); + auto *concat_in_data1 = concat_in1.mutable_data( + concat_image2d_shape_in1["width"], + concat_image2d_shape_in1["height"]); + auto *concat_in_data2 = concat_in2.mutable_data( + concat_image2d_shape_in2["width"], + concat_image2d_shape_in2["height"]); + auto *concat_out_data = concat_out.mutable_data( + concat_image2d_shape["width"], concat_image2d_shape["height"]); + + // set context and kernel args + LOG(INFO) << "set context and kernel args"; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + buf_to_img_kernel->SetParam(BufferToImageParam0); + std::unique_ptr buf_to_img_context(new KernelContext); + context->As().CopySharedTo( + &(buf_to_img_context->As())); + buf_to_img_kernel->SetContext(std::move(buf_to_img_context)); + + buf_to_img_kernel1->SetParam(BufferToImageParam1); + std::unique_ptr buf_to_img_context1(new KernelContext); + context->As().CopySharedTo( + &(buf_to_img_context1->As())); + buf_to_img_kernel1->SetContext(std::move(buf_to_img_context1)); + + buf_to_img_kernel2->SetParam(BufferToImageParam2); + std::unique_ptr buf_to_img_context2(new KernelContext); + context->As().CopySharedTo( + &(buf_to_img_context2->As())); + buf_to_img_kernel2->SetContext(std::move(buf_to_img_context2)); + + img_to_buf_kernel->SetParam(ImageToBufferParam); + std::unique_ptr img_to_buf_context(new KernelContext); + context->As().CopySharedTo( + &(img_to_buf_context->As())); + img_to_buf_kernel->SetContext(std::move(img_to_buf_context)); + + concat_img_kernel->SetParam(concatParam); + std::unique_ptr concat_img_context(new KernelContext); + context->As().CopySharedTo( + &(concat_img_context->As())); + concat_img_kernel->SetContext(std::move(concat_img_context)); + + // run kernels + LOG(INFO) << "run kernel: buf_to_img_kernel"; + buf_to_img_kernel->Launch(); + buf_to_img_kernel1->Launch(); + buf_to_img_kernel2->Launch(); + LOG(INFO) << "run kernel: concat_img_kernel"; + concat_img_kernel->Launch(); + LOG(INFO) << "run kernel: img_to_buf_kernel"; + img_to_buf_kernel->Launch(); + + CLRuntime::Global()->command_queue().finish(); + + // compute ref cp_u + std::vector ins_ptr; + std::vector in_dim; + ins_ptr.push_back(mapped_x0); + ins_ptr.push_back(mapped_x1); + ins_ptr.push_back(mapped_x2); + in_dim.push_back(x0_dim); + in_dim.push_back(x1_dim); + in_dim.push_back(x2_dim); + concat_mul_compute_ref( + ins_ptr, in_dim, axis, out_dim, y_data_ref); // result -#ifdef PRINT_RESULT - LOG(INFO) << "---- print kernel result (input -> output) ----"; - for (int eidx = 0; eidx < out_dim.production(); ++eidx) { - std::cout << "x0[" << eidx << "]:" << mapped_x0[eidx] << ",\t x1[" - << eidx << "]:" << mapped_x1[eidx] << " -> y[" << eidx - << "]:" << mapped_y[eidx] << "\t, y_ref[" << eidx - << "]:" << y_data_ref[eidx] << ",\t IS_DIFF_PASSED:" - << IS_DIFF_PASSED( - y_data_ref[eidx], mapped_y[eidx], FP16_MAX_DIFF) - << std::endl; - } -#endif // PRINT_RESULT - - // check result: compare kernel output and cpu output(y_data_ref) - for (int i = 0; i < out_dim.production(); i++) { - auto abs_diff = abs(y_data_ref[i] - mapped_y[i]); - auto relative_diff = - COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]); - EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || - (abs_diff <= FP16_MAX_DIFF), - true); - if ((relative_diff > FP16_MAX_DIFF) && - (abs_diff > FP16_MAX_DIFF)) { - LOG(ERROR) << "error idx:" << i << " mapped_y[" << i - << "]:" << mapped_y[i] << " y_data_ref[" << i - << "]:" << y_data_ref[i] << " abs_diff:" << abs_diff - << " relative_diff:" << relative_diff - << " FP16_MAX_DIFF:" << FP16_MAX_DIFF; - break; - } - } - - // free - LOG(INFO) << "free: unmap x, y"; - TargetWrapperCL::Unmap(x_data0, mapped_x0); - TargetWrapperCL::Unmap(x_data1, mapped_x1); - TargetWrapperCL::Unmap(y_data, mapped_y); -#ifdef LOOP_TEST - } // axis - } // w - } // h - } // c - } // n +#ifdef PRINT_RESULT_CONCAT_MUL + LOG(INFO) << "---- print kernel result (input -> output) ----"; + for (int i = 0; i < out_dim.production(); ++i) { + auto abs_diff = abs(y_data_ref[i] - mapped_y[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]); + LOG(INFO) << "idx:" << i << " mapped_y[" << i << "]:" << mapped_y[i] + << " y_data_ref[" << i << "]:" << y_data_ref[i] + << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF; +#if 0 // note: used to write to file and check + std::cout << "idx:" << i << " mapped_y[" << i + << "]:" << mapped_y[i] << " y_data_ref[" << i + << "]:" << y_data_ref[i] << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF << "\n"; +#endif + } +#endif // PRINT_RESULT_CONCAT_MUL + +#ifdef CHECK_RESULT_CONCAT_MUL + // check result: compare kernel output and cpu output(y_data_ref) + for (int i = 0; i < out_dim.production(); i++) { + auto abs_diff = abs(y_data_ref[i] - mapped_y[i]); + auto relative_diff = + COMPUTE_RELATIVE_DIFF(y_data_ref[i], mapped_y[i]); + if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) { + LOG(FATAL) << "error idx:" << i << " mapped_y[" << i + << "]:" << mapped_y[i] << " y_data_ref[" << i + << "]:" << y_data_ref[i] << " abs_diff:" << abs_diff + << " relative_diff:" << relative_diff + << " FP16_MAX_DIFF:" << FP16_MAX_DIFF; + break; + } + } +#endif + // free + LOG(INFO) << "free: unmap x, y"; + TargetWrapperCL::Unmap(x_data0, mapped_x0); + TargetWrapperCL::Unmap(x_data1, mapped_x1); + TargetWrapperCL::Unmap(x_data2, mapped_x2); + TargetWrapperCL::Unmap(y_data, mapped_y); +#ifdef LOOP_TEST_CONCAT_MUL + } // w + } // h + } // c #else // nothing to do. #endif diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index f53c464e99..d5eb17691e 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() { impl_ = &ConvImageCompute::DepthwiseConv2d; } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { -// #define CONV3x3OPT_FALL_BACK -#ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); - kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); - - CLImageConverterFolder converter; - const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - filter_image_h_ = filter_image_dims[1]; - filter_image_w_ = filter_image_dims[0]; - tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - - half_t* filter_image_data = - tensor_hold_filter_image_->mutable_data(); - - converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); - filter_gpu_image_->mutable_data( - filter_image_w_, filter_image_h_, filter_image_data); - - impl_ = &ConvImageCompute::Conv2d3x3opt; -#else - kernel_func_names_.push_back("conv2d_3x3"); - kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); + if (groups_ == 1) { + kernel_func_names_.push_back( + input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt"); + kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); + impl_ = &ConvImageCompute::Conv2d3x3opt; + } else { // groups_ > 1 + kernel_func_names_.push_back("conv2d_3x3"); + kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); + impl_ = &ConvImageCompute::Conv2d3x3; + } CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); filter_image_h_ = filter_image_dims[1]; filter_image_w_ = filter_image_dims[0]; tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); - half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( filter_image_w_, filter_image_h_, filter_image_data); - - impl_ = &ConvImageCompute::Conv2d3x3; -#endif -#undef CONV3x3OPT_FALL_BACK } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT @@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() { static_cast(w_blk_), static_cast(nh_blk_)}; input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3") { + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || kernel_func_names_[0] == "conv2d_3x3_opt") { int w_blk_size = 5; @@ -1185,6 +1172,56 @@ void ConvImageCompute::PrintConvInfo() { VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_; VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; + VLOG(4) << "groups_:" << groups_; + + LOG(INFO) << "================================"; + LOG(INFO) << "c_blk_=" << c_blk_ << ", w_blk_=" << w_blk_ + << ",nh_blk_=" << nh_blk_; + LOG(INFO) << "input_image_p_:" << input_image_p_; + LOG(INFO) << "filter_image_p_:" << filter_image_p_; + LOG(INFO) << "bias_image_p_:" << bias_image_p_; + LOG(INFO) << "output_image_p_:" << output_image_p_; + + LOG(INFO) << "stride_h_:" << stride_h_; + LOG(INFO) << "stride_w_:" << stride_w_; + + LOG(INFO) << "dilation_h_:" << dilation_h_; + LOG(INFO) << "dilation_w_:" << dilation_w_; + + LOG(INFO) << "pad_up_:" << pad_up_; + LOG(INFO) << "pad_down_:" << pad_down_; + LOG(INFO) << "pad_left_:" << pad_left_; + LOG(INFO) << "pad_right_:" << pad_right_; + + LOG(INFO) << "offset_:" << offset_; + LOG(INFO) << "groups_:" << groups_; + LOG(INFO) << "relu_fused_:" << relu_fused_; + LOG(INFO) << "has_bias_:" << has_bias_; + + LOG(INFO) << "input_tensor_n_:" << input_tensor_n_; + LOG(INFO) << "input_tensor_c_:" << input_tensor_c_; + LOG(INFO) << "input_tensor_h_:" << input_tensor_h_; + LOG(INFO) << "input_tensor_w_:" << input_tensor_w_; + LOG(INFO) << "input_image_h_:" << input_image_h_; + LOG(INFO) << "input_image_w_:" << input_image_w_; + LOG(INFO) << "input_c_block_:" << input_c_block_; + + LOG(INFO) << "output_tensor_n_:" << output_tensor_n_; + LOG(INFO) << "output_tensor_c_:" << output_tensor_c_; + LOG(INFO) << "output_tensor_h_:" << output_tensor_h_; + LOG(INFO) << "output_tensor_w_:" << output_tensor_w_; + LOG(INFO) << "output_image_h_:" << output_image_h_; + LOG(INFO) << "output_image_w_:" << output_image_w_; + + LOG(INFO) << "filter_tensor_n_:" << filter_tensor_n_; + LOG(INFO) << "filter_tensor_c_:" << filter_tensor_c_; + LOG(INFO) << "filter_tensor_h_:" << filter_tensor_h_; + LOG(INFO) << "filter_tensor_w_:" << filter_tensor_w_; + LOG(INFO) << "filter_image_h_:" << filter_image_h_; + LOG(INFO) << "filter_image_w_:" << filter_image_w_; + + LOG(INFO) << "bias_image_h_" << bias_image_h_; + LOG(INFO) << "bias_image_w_" << bias_image_w_; } double ConvImageCompute::Tune(int times) { -- GitLab