未验证 提交 394c2833 编写于 作者: Y ysh329 提交者: GitHub

[KERNEL][OPENCL] Fix concat; Enable conv3x3 with group (#4180)

* [KERNEL][OPENCL] Support OpenCL concat with 3 and 4 inputs; enable conv3x3 with group; fix the OpenCL buffer concat; clean up the CMake targets of the OpenCL kernels. test=develop
上级 396607ca
......@@ -14,45 +14,60 @@ limitations under the License. */
#include <cl_common.h>
__kernel void concat2(__global const CL_DTYPE* x_data0, __global const CL_DTYPE* x_data1, __global CL_DTYPE* out_data,
int size, int axis_size, int pre_size, int post_size, int total, int total0, int total1) {
__kernel void concat2(__global const CL_DTYPE* x_data0,
__global const CL_DTYPE* x_data1,
__global CL_DTYPE* out_data,
int size,
int axis_size,
int pre_size,
int post_size,
int total,
int total0,
int total1) {
const int index = get_global_id(0);
if (index < size){
for (int i = 0; i < pre_size; i++){
if (index < size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data0 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data0 + offset_in;
for (int k = 0; k < post_size; k++){
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data0 + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}else if (index < axis_size){
for (int i = 0; i < pre_size; i++){
} else if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = index * post_size + i * total;
int offset_in = index * post_size + i * total1;
// memcpy(out_data + offset_out, x_data1 + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data1 + offset_in;
for (int k = 0; k < post_size; k++){
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data1 + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
}
}
__kernel void concat_mul(__global const CL_DTYPE* x_data, __global CL_DTYPE* out_data,
int axis_size, int pre_size, int post_size, int start, int total, int total0) {
const int index = get_global_id(0);
if (index < axis_size){
for (int i = 0; i < pre_size; i++){
__kernel void concat_mul_buffer(
__global const CL_DTYPE* x_data,
__global CL_DTYPE* out_data,
int axis_size,
int pre_size,
int post_size,
int start,
int total,
int total0) {
const int index = get_global_id(0); // [0, axis_size)
if (index < axis_size) {
for (int i = 0; i < pre_size; i++) {
int offset_out = (start + index) * post_size + i * total;
int offset_in = index * post_size + i * total0;
// memcpy(out_data + offset_out, x_data + offset_in, post_size);
CL_DTYPE* dst = out_data + offset_out;
CL_DTYPE* src = x_data + offset_in;
for (int k = 0; k < post_size; k++){
__global CL_DTYPE* dst = (__global CL_DTYPE*)(out_data + offset_out);
__global CL_DTYPE* src = (__global CL_DTYPE*)(x_data + offset_in);
for (int k = 0; k < post_size; k++) {
*dst++ = *src++;
}
}
......
......@@ -77,7 +77,7 @@ __kernel void hard_sigmoid(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = clamp(in * scale + value_offset, 0.0, 1.0);
CL_DTYPE4 out = clamp(in * (CL_DTYPE4)(scale) + (CL_DTYPE4)(value_offset), (CL_DTYPE4)(0.0), (CL_DTYPE4)(1.0));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......
......@@ -11,6 +11,286 @@ limitations under the License. */
#include <cl_common.h>
// deprecated
/*
 * Channel-axis concat of two image2d tensors.
 *
 * Each work-item produces one output pixel, i.e. the four consecutive output
 * channels out_c * 4 .. out_c * 4 + 3 at column out_w and row out_nh.
 * Output channel c is fetched from input0 when c < input0_tensor_c and from
 * input1 (at channel c - input0_tensor_c) otherwise. The source pixel is the
 * channel block c_in / 4 and the lane inside that pixel is c_in % 4.
 *
 * output_tensor_w is used as the per-channel-block x stride for the output
 * and for both inputs.
 * input1_tensor_c is not read; it stays in the signature so the host-side
 * argument list is unchanged.
 *
 * Lanes for channels >= output_tensor_c are written as zero.
 */
__kernel void concatByCWith2Inputs(
    __write_only image2d_t output_image,
    __private const int output_tensor_c,
    __private const int output_tensor_w,
    __read_only image2d_t input0_image,
    __private const int input0_tensor_c,
    __read_only image2d_t input1_image,
    __private const int input1_tensor_c) {
  const int out_c = get_global_id(0);   // [0, (output_tensor_c + 3) / 4)
  const int out_w = get_global_id(1);   // [0, output_tensor_w)
  const int out_nh = get_global_id(2);  // [0, output_tensor_n * output_tensor_h)

  // Image reads with integer coordinates are only defined for samplers that
  // use unnormalized coordinates, hence CLK_NORMALIZED_COORDS_FALSE.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  const int2 output_pos = (int2)(out_c * output_tensor_w + out_w, out_nh);

  // Start from zero so padding lanes of the last channel block are defined.
  CL_DTYPE4 output_data = (CL_DTYPE4)(0.0f);

  for (int i = 0; i < 4; i++) {
    const int c = out_c * 4 + i;
    if (c >= output_tensor_c) {
      break;
    }

    // Map the output channel to (source image, channel inside that image).
    const bool from_input0 = c < input0_tensor_c;
    const int c_in = from_input0 ? c : c - input0_tensor_c;
    const int2 input_pos =
        (int2)((c_in / 4) * output_tensor_w + out_w, out_nh);

    CL_DTYPE4 input_data;
    if (from_input0) {
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
    } else {
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
    }

    // Pick the lane that holds channel c_in inside the source pixel.
    const int lane = c_in % 4;
    const CL_DTYPE value = (lane == 0) ? input_data.x
                         : (lane == 1) ? input_data.y
                         : (lane == 2) ? input_data.z
                                       : input_data.w;

    // Store it in lane i of the output pixel.
    switch (i) {
      case 0:
        output_data.x = value;
        break;
      case 1:
        output_data.y = value;
        break;
      case 2:
        output_data.z = value;
        break;
      default:
        output_data.w = value;
        break;
    }
  }

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
/*
 * Channel-axis concat of three image2d tensors.
 *
 * Each work-item produces one output pixel, i.e. the four consecutive output
 * channels out_c * 4 .. out_c * 4 + 3 at column out_w and row out_nh.
 * Output channel c is taken from:
 *   input0 when c < input0_tensor_c,
 *   input1 when c < input0_tensor_c + input1_tensor_c,
 *   input2 otherwise,
 * at the channel obtained by subtracting the channel counts of the preceding
 * inputs. The source pixel is the channel block c_in / 4 and the lane inside
 * that pixel is c_in % 4.
 *
 * output_tensor_w is used as the per-channel-block x stride for the output
 * and for all inputs.
 * input2_tensor_c is not read; it stays in the signature so the host-side
 * argument list is unchanged.
 *
 * Lanes for channels >= output_tensor_c are written as zero.
 */
__kernel void concatByCWith3Inputs(
    __write_only image2d_t output_image,
    __private const int output_tensor_c,
    __private const int output_tensor_w,
    __read_only image2d_t input0_image,
    __private const int input0_tensor_c,
    __read_only image2d_t input1_image,
    __private const int input1_tensor_c,
    __read_only image2d_t input2_image,
    __private const int input2_tensor_c) {
  const int out_c = get_global_id(0);   // [0, (output_tensor_c + 3) / 4)
  const int out_w = get_global_id(1);   // [0, output_tensor_w)
  const int out_nh = get_global_id(2);  // [0, output_tensor_n * output_tensor_h)

  // Image reads with integer coordinates are only defined for samplers that
  // use unnormalized coordinates, hence CLK_NORMALIZED_COORDS_FALSE.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  const int2 output_pos = (int2)(out_c * output_tensor_w + out_w, out_nh);

  // First output channel that belongs to input1 / input2.
  const int c1_begin = input0_tensor_c;
  const int c2_begin = c1_begin + input1_tensor_c;

  // Start from zero so padding lanes of the last channel block are defined.
  CL_DTYPE4 output_data = (CL_DTYPE4)(0.0f);

  for (int i = 0; i < 4; i++) {
    const int c = out_c * 4 + i;
    if (c >= output_tensor_c) {
      break;
    }

    int c_in;
    int2 input_pos;
    input_pos.y = out_nh;
    CL_DTYPE4 input_data;
    if (c < c1_begin) {
      c_in = c;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
    } else if (c < c2_begin) {
      c_in = c - c1_begin;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
    } else {
      c_in = c - c2_begin;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
    }

    // Pick the lane that holds channel c_in inside the source pixel.
    const int lane = c_in % 4;
    const CL_DTYPE value = (lane == 0) ? input_data.x
                         : (lane == 1) ? input_data.y
                         : (lane == 2) ? input_data.z
                                       : input_data.w;

    // Store it in lane i of the output pixel.
    switch (i) {
      case 0:
        output_data.x = value;
        break;
      case 1:
        output_data.y = value;
        break;
      case 2:
        output_data.z = value;
        break;
      default:
        output_data.w = value;
        break;
    }
  }

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
/*
 * Channel-axis concat of four image2d tensors.
 *
 * Each work-item produces one output pixel, i.e. the four consecutive output
 * channels out_c * 4 .. out_c * 4 + 3 at column out_w and row out_nh.
 * Output channel c is taken from input0, input1, input2 or input3 depending
 * on which cumulative channel range it falls into; the channel inside the
 * chosen input is c minus the channel counts of the preceding inputs. The
 * source pixel is the channel block c_in / 4 and the lane inside that pixel
 * is c_in % 4.
 *
 * output_tensor_w is used as the per-channel-block x stride for the output
 * and for all inputs.
 *
 * The last range is handled by a plain `else`, like the 2- and 3-input
 * kernels, so c_in and input_data are always assigned before use.
 * input3_tensor_c is therefore not read; it stays in the signature so the
 * host-side argument list is unchanged.
 *
 * Lanes for channels >= output_tensor_c are written as zero.
 */
__kernel void concatByCWith4Inputs(
    __write_only image2d_t output_image,
    __private const int output_tensor_c,
    __private const int output_tensor_w,
    __read_only image2d_t input0_image,
    __private const int input0_tensor_c,
    __read_only image2d_t input1_image,
    __private const int input1_tensor_c,
    __read_only image2d_t input2_image,
    __private const int input2_tensor_c,
    __read_only image2d_t input3_image,
    __private const int input3_tensor_c) {
  const int out_c = get_global_id(0);   // [0, (output_tensor_c + 3) / 4)
  const int out_w = get_global_id(1);   // [0, output_tensor_w)
  const int out_nh = get_global_id(2);  // [0, output_tensor_n * output_tensor_h)

  // Image reads with integer coordinates are only defined for samplers that
  // use unnormalized coordinates, hence CLK_NORMALIZED_COORDS_FALSE.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  const int2 output_pos = (int2)(out_c * output_tensor_w + out_w, out_nh);

  // First output channel that belongs to input1 / input2 / input3.
  const int c1_begin = input0_tensor_c;
  const int c2_begin = c1_begin + input1_tensor_c;
  const int c3_begin = c2_begin + input2_tensor_c;

  // Start from zero so padding lanes of the last channel block are defined.
  CL_DTYPE4 output_data = (CL_DTYPE4)(0.0f);

  for (int i = 0; i < 4; i++) {
    const int c = out_c * 4 + i;
    if (c >= output_tensor_c) {
      break;
    }

    int c_in;
    int2 input_pos;
    input_pos.y = out_nh;
    CL_DTYPE4 input_data;
    if (c < c1_begin) {
      c_in = c;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input0_image, sampler, input_pos);
    } else if (c < c2_begin) {
      c_in = c - c1_begin;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input1_image, sampler, input_pos);
    } else if (c < c3_begin) {
      c_in = c - c2_begin;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input2_image, sampler, input_pos);
    } else {
      c_in = c - c3_begin;
      input_pos.x = (c_in / 4) * output_tensor_w + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input3_image, sampler, input_pos);
    }

    // Pick the lane that holds channel c_in inside the source pixel.
    const int lane = c_in % 4;
    const CL_DTYPE value = (lane == 0) ? input_data.x
                         : (lane == 1) ? input_data.y
                         : (lane == 2) ? input_data.z
                                       : input_data.w;

    // Store it in lane i of the output pixel.
    switch (i) {
      case 0:
        output_data.x = value;
        break;
      case 1:
        output_data.y = value;
        break;
      case 2:
        output_data.z = value;
        break;
      default:
        output_data.w = value;
        break;
    }
  }

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// deprecated
/*
 * Copies one input image into the output image, shifted down by out_H_Start
 * rows. Each work-item copies the pixel at
 * (in_c * out_W + in_w, in_nh) to (same x, out_H_Start + in_nh).
 *
 * out_W is used as the x stride between consecutive in_c blocks of the
 * input image as well as of the output image.
 */
__kernel void concatByH(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_W,
                        __private const int out_H_Start) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  // Image reads with integer coordinates are only defined for samplers that
  // use unnormalized coordinates, hence CLK_NORMALIZED_COORDS_FALSE.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  const int2 input_pos = (int2)(in_c * out_W + in_w, in_nh);
  const CL_DTYPE4 input =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);

  // Same column, row moved past the rows written by earlier inputs.
  const int2 output_pos = (int2)(input_pos.x, out_H_Start + input_pos.y);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
// deprecated
/*
 * Copies one input image into the output image, shifted along x. Each
 * work-item reads the pixel at (in_c * in_W + in_w, in_nh) and writes it to
 * x = in_c * in_W + in_w + pre_Width + out_Width * in_c on the same row.
 *
 * NOTE(review): the output x keeps the in_c * in_W term and adds
 * out_Width * in_c on top of it. That equals a per-block output stride only
 * if out_Width is the width difference between output and input rather than
 * the full output width -- confirm against the host code that sets the args.
 */
__kernel void concatByW(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int in_W,
                        __private const int pre_Width,
                        __private const int out_Width) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  // Image reads with integer coordinates are only defined for samplers that
  // use unnormalized coordinates, hence CLK_NORMALIZED_COORDS_FALSE.
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  const int2 input_pos = (int2)(in_c * in_W + in_w, in_nh);
  const CL_DTYPE4 input =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);

  const int2 output_pos =
      (int2)(input_pos.x + pre_Width + out_Width * in_c, input_pos.y);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
__kernel void concat2(__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
......@@ -103,62 +383,3 @@ __kernel void concat2(__read_only image2d_t input0,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
}
}
// Copies one pixel of `input` into `output` at a position that depends on
// `flag`: 1 = concat by channel, 2 = by height, 3 = by width (see the branch
// comments below). C_0 is added to the coordinate along the chosen axis --
// presumably the extent already occupied by earlier inputs; confirm against
// the host code.
__kernel void concat_mul(__read_only image2d_t input,
__write_only image2d_t output,
int flag, int C_0, int out_C, int out_W, int in_W, int width) {
const int in_w = get_global_id(0); // column inside one in_W-wide block of the image
const int in_c = get_global_id(1); // block index along image x (x = in_c * in_W + in_w)
const int in_nh = get_global_id(2); // image row (image_height nxh)
// NOTE(review): integer coordinates are used with CLK_NORMALIZED_COORDS_TRUE;
// the OpenCL spec only defines integer-coordinate reads for unnormalized samplers.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 input_pos;
// This outer output_pos is never used: every branch below declares its own.
int2 output_pos;
input_pos.x = in_c * in_W + in_w;
input_pos.y = in_nh;
CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
if (flag == 1){ // by channel
// NOTE(review): output_data is never initialized and only one lane is
// assigned per loop iteration, yet the whole 4-lane vector is written each
// time. The other lanes of the destination pixel therefore receive
// indeterminate or stale values.
CL_DTYPE4 output_data;
for (int i = 0; i < 4; i++) {
// Output channel for lane i of the input pixel, shifted by C_0.
int c_out = C_0 + in_c * 4 + i;
if (c_out >= out_C) {
break;
}
int2 output_pos;
output_pos.x = (c_out / 4) * in_W + in_w;
output_pos.y = in_nh;
// Lane i of the input pixel.
CL_DTYPE val;
if (i == 0) {
val = input_data.x;
} else if (i == 1) {
val = input_data.y;
} else if (i == 2) {
val = input_data.z;
} else if (i == 3) {
val = input_data.w;
}
// Place it in lane c_out % 4 of the output pixel.
if (c_out % 4 == 0){
output_data.x = val;
}else if (c_out % 4 == 1){
output_data.y = val;
}else if (c_out % 4 == 2){
output_data.z = val;
}else if (c_out % 4 == 3){
output_data.w = val;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
}
}else if (flag == 2){ // by height, width == n
// Same column; row shifted by C_0 * width.
int2 output_pos;
output_pos.x = in_c * in_W + in_w;
output_pos.y = in_nh + C_0 * width;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}else if (flag == 3){ // by width, width == C
// Same row; column shifted by C_0 inside an out_W-wide block.
int2 output_pos;
output_pos.y = in_nh;
output_pos.x = in_c * out_W + (in_w + C_0);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
}
}
......@@ -8,35 +8,35 @@ set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper
# image kernel #
#####################
# basic
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
add_kernel(elementwise_add_opencl_image OPENCL basic SRCS elementwise_add_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_sub_opencl_image OPENCL basic SRCS elementwise_sub_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl_image OPENCL basic SRCS elementwise_mul_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_add_activation_image_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl
DEPS elementwise_add_opencl_image ${cl_kernel_deps})
add_kernel(fusion_elementwise_sub_activation_opencl_image
OPENCL basic SRCS fusion_elementwise_sub_activation_image_compute.cc
DEPS elementwise_sub_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
DEPS elementwise_sub_opencl_image ${cl_kernel_deps})
add_kernel(pool_opencl_image OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(activation_opencl_image OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl_image OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(transpose_opencl_image OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl_image OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(layout_opencl_image OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(concat_opencl_image OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl_image OPENCL basic SRCS nearest_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(scale_opencl_image OPENCL basic SRCS scale_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(grid_sampler_opencl_image OPENCL basic SRCS grid_sampler_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(lrn_opencl_image OPENCL basic SRCS lrn_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(bilinear_interp_opencl_image OPENCL basic SRCS bilinear_interp_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(slice_opencl_image OPENCL basic SRCS slice_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(instance_norm_opencl_image OPENCL basic SRCS instance_norm_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(dropout_opencl_image OPENCL basic SRCS dropout_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pad2d_opencl_image OPENCL basic SRCS pad2d_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(box_coder_opencl_image OPENCL basic SRCS box_coder_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pixel_shuffle_opencl_image OPENCL basic SRCS pixel_shuffle_image_compute.cc DEPS ${cl_kernel_deps})
add_kernel(expand_opencl_image OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -48,89 +48,89 @@ add_kernel(expand_opencl OPENCL basic SRCS expand_image_compute.cc DEPS ${cl_ker
# image kernel test #
######################
lite_cc_test(test_activation_image_opencl SRCS activation_image_compute_test.cc
DEPS activation_opencl layout_opencl op_registry program context)
DEPS activation_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_conv_image_opencl SRCS conv_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_depthwise_conv2d_image_opencl SRCS depthwise_conv2d_image_compute_test.cc
DEPS conv_opencl op_registry program context)
DEPS conv_opencl_image op_registry program context)
lite_cc_test(test_nearest_interp_image_opencl SRCS nearest_interp_image_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context)
DEPS nearest_interp_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_pool_image_opencl SRCS pool_image_compute_test.cc
DEPS pool_opencl op_registry program context)
DEPS pool_opencl_image op_registry program context)
lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc
DEPS scale_opencl op_registry program context)
DEPS scale_opencl_image op_registry program context)
lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc
DEPS reshape_opencl op_registry program context)
DEPS reshape_opencl_image op_registry program context)
lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc
DEPS transpose_opencl layout_opencl op_registry program context)
DEPS transpose_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_concat_image_opencl SRCS concat_image_compute_test.cc
DEPS concat_opencl layout_opencl op_registry program context)
DEPS concat_opencl_image layout_opencl_image op_registry program context)
#lite_cc_test(test_elementwise_mul_image_opencl SRCS elementwise_mul_image_compute_test.cc
# DEPS elementwise_mul_opencl op_registry program context)
# DEPS elementwise_mul_opencl_image op_registry program context)
lite_cc_test(test_layout_image_opencl SRCS layout_image_compute_test.cc
DEPS layout_opencl op_registry program context)
DEPS layout_opencl_image op_registry program context)
lite_cc_test(test_pixel_shuffle_image_opencl SRCS pixel_shuffle_image_compute_test.cc
DEPS pixel_shuffle_opencl op_registry program context)
DEPS pixel_shuffle_opencl_image op_registry program context)
lite_cc_test(test_expand_image_opencl SRCS expand_image_compute_test.cc
DEPS expand_opencl op_registry program context)
DEPS expand_opencl_image op_registry program context)
lite_cc_test(test_elementwise_add_image_opencl SRCS elementwise_add_image_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context)
DEPS elementwise_add_opencl_image fusion_elementwise_add_activation_opencl_image op_registry program context)
lite_cc_test(test_elementwise_sub_image_opencl SRCS elementwise_sub_image_compute_test.cc
DEPS elementwise_sub_opencl fusion_elementwise_sub_activation_opencl op_registry program context)
DEPS elementwise_sub_opencl_image fusion_elementwise_sub_activation_opencl_image op_registry program context)
lite_cc_test(test_grid_sampler_image_opencl SRCS grid_sampler_image_compute_test.cc
DEPS grid_sampler_opencl op_registry program context)
DEPS grid_sampler_opencl_image op_registry program context)
lite_cc_test(test_lrn_image_opencl SRCS lrn_image_compute_test.cc
DEPS lrn_opencl op_registry program context)
DEPS lrn_opencl_image op_registry program context)
lite_cc_test(test_bilinear_interp_image_opencl SRCS bilinear_interp_image_compute_test.cc
DEPS bilinear_interp_opencl op_registry program context)
DEPS bilinear_interp_opencl_image op_registry program context)
lite_cc_test(test_slice_image_opencl SRCS slice_image_compute_test.cc
DEPS slice_opencl op_registry program context)
DEPS slice_opencl_image op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl op_registry program context)
#lite_cc_test(test_instance_norm_image_opencl SRCS instance_norm_image_compute_test.cc
# DEPS instance_norm_opencl_image op_registry program context)
lite_cc_test(test_dropout_image_opencl SRCS dropout_image_compute_test.cc
DEPS dropout_opencl op_registry program context)
DEPS dropout_opencl_image op_registry program context)
lite_cc_test(test_pad2d_image_opencl SRCS pad2d_image_compute_test.cc
DEPS pad2d_opencl layout_opencl op_registry program context)
DEPS pad2d_opencl_image layout_opencl_image op_registry program context)
lite_cc_test(test_box_coder_image_opencl SRCS box_coder_image_compute_test.cc
DEPS box_coder_opencl op_registry program context)
DEPS box_coder_opencl_image op_registry program context)
######################
# buffer kernel #
######################
# basic
#add_kernel(activation_opencl OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(activation_opencl_buffer OPENCL basic SRCS activation_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv_opencl_buffer OPENCL basic SRCS conv_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(pool_opencl_buffer OPENCL basic SRCS pool_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(concat_opencl_buffer OPENCL basic SRCS concat_buffer_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fc_opencl_buffer OPENCL basic SRCS fc_buffer_compute.cc DEPS ${cl_kernel_deps})
# NOTE(ysh329): use fc as `mul`, and mul is not used.
#add_kernel(mul_opencl OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl
#add_kernel(mul_opencl_buffer OPENCL basic SRCS mul_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(elementwise_add_opencl_buffer OPENCL basic SRCS elementwise_add_buffer_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(fusion_elementwise_add_activation_opencl_buffer
# OPENCL basic SRCS fusion_elementwise_add_activation_buffer_compute.cc
# DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(io_copy_opencl_buffer OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
# extra
# wait to add ...
......@@ -156,10 +156,10 @@ add_kernel(io_copy_opencl OPENCL basic SRCS io_copy_buffer_compute.cc DEPS ${ten
# DEPS pool_opencl op_registry program context)
#lite_cc_test(test_concat_buffer_opencl SRCS concat_buffer_compute_test.cc
# DEPS concat_opencl op_registry program context)
# DEPS concat_opencl_buffer op_registry program context)
lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
DEPS fc_opencl op_registry program context)
DEPS fc_opencl_buffer op_registry program context)
#lite_cc_test(test_mul_buffer_opencl SRCS mul_buffer_compute_test.cc
# DEPS mul_opencl op_registry program context)
......@@ -168,4 +168,4 @@ lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc
# DEPS elementwise_add_opencl op_registry program context)
lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS io_copy_opencl op_registry program context)
DEPS io_copy_opencl_buffer op_registry program context)
......@@ -77,7 +77,7 @@ class ActivationComputeImageDefault
scale_ = act_param_->hard_sigmoid_slope;
threshold_ = act_param_->hard_sigmoid_offset;
break;
defauln:
default:
LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return;
}
......
......@@ -40,7 +40,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
if (concat_param_->x.size() == 2) {
kernel_func_name_ = "concat2";
} else {
kernel_func_name_ = "concat_mul";
kernel_func_name_ = "concat_mul_buffer";
}
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/concat_kernel.cl",
......@@ -86,7 +86,6 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.output->dims();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
const auto& y_dims = param.output->dims(); // useless: check dim only
......@@ -98,8 +97,9 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto inputs = param.x;
int arg_idx = 0;
auto global_work_size = cl::NDRange{axis_size_};
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(axis_size_)};
int total = axis_size_ * post_size_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
if (inputs.size() == 2) {
auto* x_buf0 = inputs[0]->data<float, cl::Buffer>();
......@@ -144,6 +144,15 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto* x_buf = inputs[i]->data<float, cl::Buffer>();
global_work_size = cl::NDRange{static_cast<size_t>(size)};
int total0 = size * post_size_;
#ifdef LITE_WITH_LOG
LOG(INFO) << "------------- i=" << i << " -------------";
LOG(INFO) << "pre_size:" << pre_size_;
LOG(INFO) << "post_size:" << post_size_;
LOG(INFO) << "size:" << size;
LOG(INFO) << "start:" << start;
LOG(INFO) << "total:" << total;
LOG(INFO) << "total0:" << total0;
#endif
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
......
......@@ -99,13 +99,14 @@ TEST(opencl_concat_buffer, compute) {
auto *mapped_x2 = static_cast<float *>(
TargetWrapperCL::Map(x2_data, 0, sizeof(float) * x2_dim.production()));
for (int i = 0; i < x0_dim.production(); i++) {
mapped_x0[i] = dist(engine);
mapped_x0[i] = i + 1; // dist(engine);
}
for (int i = 0; i < x1_dim.production(); i++) {
mapped_x1[i] = dist(engine);
mapped_x1[i] = x0_dim.production() + i + 1; // dist(engine);
}
for (int i = 0; i < x2_dim.production(); i++) {
mapped_x2[i] = dist(engine);
mapped_x2[i] =
x0_dim.production() + x1_dim.production() + i + 1; // dist(engine);
}
// set param and kernel, then run
......@@ -151,9 +152,13 @@ TEST(opencl_concat_buffer, compute) {
auto *out_data = out.mutable_data<float, cl::Buffer>();
auto *mapped_out = static_cast<float *>(
TargetWrapperCL::Map(out_data, 0, sizeof(float) * out_dim.production()));
#ifdef PRINT_RESULT_CONCAT_BUFFER
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
LOG(INFO) << "i:" << i << ", out[" << i << "]:" << mapped_out[i]
<< ", out_ref_data[" << i << "]:" << out_ref_data[i];
}
#endif
EXPECT_NEAR(mapped_out[i], out_ref_data[i], 1e-6);
TargetWrapperCL::Unmap(out_data, mapped_out);
TargetWrapperCL::Unmap(x0_data, mapped_x0);
TargetWrapperCL::Unmap(x1_data, mapped_x1);
......
......@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
impl_ = &ConvImageCompute::DepthwiseConv2d;
} else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) {
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch"
: "conv2d_3x3_opt");
if (groups_ == 1) {
kernel_func_names_.push_back(
input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3opt;
#else
} else { // groups_ > 1
kernel_func_names_.push_back("conv2d_3x3");
kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl");
impl_ = &ConvImageCompute::Conv2d3x3;
}
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
filter_image_h_ = filter_image_dims[1];
filter_image_w_ = filter_image_dims[0];
tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4});
half_t* filter_image_data =
tensor_hold_filter_image_->mutable_data<half_t>();
converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
filter_gpu_image_->mutable_data<half_t, cl::Image2D>(
filter_image_w_, filter_image_h_, filter_image_data);
impl_ = &ConvImageCompute::Conv2d3x3;
#endif
#undef CONV3x3OPT_FALL_BACK
} else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
......@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
input_c_block_ = static_cast<const int>((input_tensor_c_ + 3) / 4);
} else if (kernel_func_names_[0] == "conv2d_3x3") {
global_work_size_ = cl::NDRange{static_cast<size_t>(c_blk_),
static_cast<size_t>(w_blk_),
static_cast<size_t>(nh_blk_)};
} else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" ||
kernel_func_names_[0] == "conv2d_3x3_opt") {
int w_blk_size = 5;
......@@ -1185,6 +1172,56 @@ void ConvImageCompute::PrintConvInfo() {
VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
VLOG(4) << "groups_:" << groups_;
LOG(INFO) << "================================";
LOG(INFO) << "c_blk_=" << c_blk_ << ", w_blk_=" << w_blk_
<< ",nh_blk_=" << nh_blk_;
LOG(INFO) << "input_image_p_:" << input_image_p_;
LOG(INFO) << "filter_image_p_:" << filter_image_p_;
LOG(INFO) << "bias_image_p_:" << bias_image_p_;
LOG(INFO) << "output_image_p_:" << output_image_p_;
LOG(INFO) << "stride_h_:" << stride_h_;
LOG(INFO) << "stride_w_:" << stride_w_;
LOG(INFO) << "dilation_h_:" << dilation_h_;
LOG(INFO) << "dilation_w_:" << dilation_w_;
LOG(INFO) << "pad_up_:" << pad_up_;
LOG(INFO) << "pad_down_:" << pad_down_;
LOG(INFO) << "pad_left_:" << pad_left_;
LOG(INFO) << "pad_right_:" << pad_right_;
LOG(INFO) << "offset_:" << offset_;
LOG(INFO) << "groups_:" << groups_;
LOG(INFO) << "relu_fused_:" << relu_fused_;
LOG(INFO) << "has_bias_:" << has_bias_;
LOG(INFO) << "input_tensor_n_:" << input_tensor_n_;
LOG(INFO) << "input_tensor_c_:" << input_tensor_c_;
LOG(INFO) << "input_tensor_h_:" << input_tensor_h_;
LOG(INFO) << "input_tensor_w_:" << input_tensor_w_;
LOG(INFO) << "input_image_h_:" << input_image_h_;
LOG(INFO) << "input_image_w_:" << input_image_w_;
LOG(INFO) << "input_c_block_:" << input_c_block_;
LOG(INFO) << "output_tensor_n_:" << output_tensor_n_;
LOG(INFO) << "output_tensor_c_:" << output_tensor_c_;
LOG(INFO) << "output_tensor_h_:" << output_tensor_h_;
LOG(INFO) << "output_tensor_w_:" << output_tensor_w_;
LOG(INFO) << "output_image_h_:" << output_image_h_;
LOG(INFO) << "output_image_w_:" << output_image_w_;
LOG(INFO) << "filter_tensor_n_:" << filter_tensor_n_;
LOG(INFO) << "filter_tensor_c_:" << filter_tensor_c_;
LOG(INFO) << "filter_tensor_h_:" << filter_tensor_h_;
LOG(INFO) << "filter_tensor_w_:" << filter_tensor_w_;
LOG(INFO) << "filter_image_h_:" << filter_image_h_;
LOG(INFO) << "filter_image_w_:" << filter_image_w_;
LOG(INFO) << "bias_image_h_" << bias_image_h_;
LOG(INFO) << "bias_image_w_" << bias_image_w_;
}
double ConvImageCompute::Tune(int times) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册