From 8c2e1dda31080de38108f97ea745e83663ead47a Mon Sep 17 00:00:00 2001 From: liuruilong Date: Tue, 16 Oct 2018 22:35:36 +0800 Subject: [PATCH] update conv cl kernel --- src/framework/cl/cl_helper.h | 3 +- .../cl/cl_kernel/conv_add_bn_relu_kernel.cl | 324 +++++++++++++++++- .../kernel/cl/cl_kernel/conv_add_kernel.cl | 320 ++++++++++++++++- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 33 +- src/operators/kernel/cl/conv_add_kernel.cpp | 30 +- .../kernel/cl/elementwise_add_kernel.cpp | 37 +- src/operators/kernel/cl/relu_kernel.cpp | 24 +- src/operators/kernel/cl/softmax_kernel.cpp | 2 +- 8 files changed, 730 insertions(+), 43 deletions(-) diff --git a/src/framework/cl/cl_helper.h b/src/framework/cl/cl_helper.h index e1fdd54d6a..8640f6b1a4 100644 --- a/src/framework/cl/cl_helper.h +++ b/src/framework/cl/cl_helper.h @@ -65,8 +65,7 @@ class CLHelper { auto work_size_2 = n * h; return {work_size_0, work_size_1, work_size_2}; - }else if(image_dim.size()==2){ - + } else if (image_dim.size() == 2) { auto image_width = image.ImageWidth(); auto work_size_0 = image_width / image_dim[1]; diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl index f27660a919..7b98369af1 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -12,10 +12,324 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + + #define BIASE #define BATCH_NORM -#define RELU -#include "conv_kernel.inc.cl" -#undef -#undef -#undef + +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + + #ifdef BIASE + half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input[9]; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + for (int j = 0; j < 9; ++j) { + int2 fuck; + fuck.x = i * 3 + j % 3; + fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, fuck); + output.x += dot(input[j], weight_x); + + fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, fuck); + output.y += dot(input[j], weight_y); + + fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, fuck); + output.z += dot(input[j], weight_z); + + fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, fuck); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + + + + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + const uint kernelHXW = 1; + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + int weight_x_to = out_c * 3; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); + output.x += input.x * weight.x; + output.y += input.y * weight.y; + output.z += input.z * weight.z; + output.w += input.w * weight.w; + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + +__kernel void conv_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); + output.x += dot(input, weight_x); + + half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)); + output.y += dot(input, weight_y); + + half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)); + output.z += dot(input, weight_z); + + half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); + output.w += dot(input, weight_w); + + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl index 3ec50f82d2..20041c80a5 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -12,6 +12,322 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + #define BIASE -#include "conv_kernel.inc.cl" -#undef + +__kernel void conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + + #ifdef BIASE + half4 output = read_imageh(bias, sampler, int2(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input[9]; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[1] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[2] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + + input[3] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[4] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + input[5] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + + input[6] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[7] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + input[8] = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + + for (int j = 0; j < 9; ++j) { + int2 fuck; + fuck.x = i * 3 + j % 3; + fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, fuck); + output.x += dot(input[j], weight_x); + + fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, fuck); + output.y += dot(input[j], weight_y); + + fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, fuck); + output.z += dot(input[j], weight_z); + + fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, fuck); + output.w += dot(input[j], weight_w); + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + + + + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + const uint kernelHXW = 1; + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + int weight_x_to = out_c * 3; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); + output.x += input.x * weight.x; + output.y += input.y * weight.y; + output.z += input.z * weight.z; + output.w += input.w * weight.w; + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + +__kernel void conv_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + const uint kernelHXW = 1; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + half4 input = read_imageh(input_image, sampler, pos_in); + + half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); + output.x += dot(input, weight_x); + + half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1)); + output.y += dot(input, weight_y); + + half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2)); + output.z += dot(input, weight_z); + + half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); + output.w += dot(input, weight_w); + + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)) +#endif + +#ifdef RELU + output = activation(output); +#endif + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); + +} + diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 610ea50efd..20c148d776 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -130,23 +130,54 @@ void ConvAddBNReluKernel::Compute( cl_int status; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 15, sizeof(int), &output_width); - status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); CL_CHECK_ERRORS(status); status = diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index f2e1e7a5d7..c6306aeba7 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -73,27 +73,53 @@ void ConvAddKernel::Compute( cl_int status; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(int), &w); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(int), &nh); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 7, sizeof(int), &stride); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 8, sizeof(int), &offset); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 9, sizeof(int), &input_c); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 10, sizeof(int), &dilation); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 11, sizeof(int), &input_width); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 12, sizeof(int), &input_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 13, sizeof(int), &output_width); - status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 14, sizeof(int), &output_height); CL_CHECK_ERRORS(status); status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, default_work_size.data(), NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); } diff --git a/src/operators/kernel/cl/elementwise_add_kernel.cpp b/src/operators/kernel/cl/elementwise_add_kernel.cpp index 856afc3b91..5b31a9c61d 100644 --- a/src/operators/kernel/cl/elementwise_add_kernel.cpp +++ b/src/operators/kernel/cl/elementwise_add_kernel.cpp @@ -22,16 +22,16 @@ namespace operators { template <> bool ElementwiseAddKernel::Init( ElementwiseAddParam *param) { - CLImage *bias = (CLImage*)param->InputY(); - bias->InitCLImage(cl_helper_.CLContext(),this->cl_helper_.CLCommandQueue()); - if(bias->dims().size()==4){ - this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); - }else if(param->InputY()->dims().size()==1){ - DLOG<<"-----init add-----"; - this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); - }else{ - DLOG << "error:bias dims is error"; - } + CLImage *bias = (CLImage *)param->InputY(); + bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue()); + if (bias->dims().size() == 4) { + this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + } else if (param->InputY()->dims().size() == 1) { + DLOG << "-----init add-----"; + this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl"); + } else { + DLOG << "error:bias dims is error"; + } return true; } @@ -44,7 +44,7 @@ void ElementwiseAddKernel::Compute( auto output = param.Out(); cl_int status; auto kernel = this->cl_helper_.KernelAt(0); - if(bias->dims().size()==4){ + if (bias->dims().size() == 4) { cl_mem input_image = input->GetCLImage(); cl_mem bias_image = bias->GetCLImage(); cl_mem output_image = output->GetCLImage(); @@ -57,10 +57,11 @@ void ElementwiseAddKernel::Compute( int width = input->ImageWidth(); int height = input->ImageHeight(); size_t global_work_size[2] = {width, height}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - }else if(bias->dims().size()==1){ + } else if (bias->dims().size() == 1) { cl_mem input_image = input->GetCLImage(); cl_mem bias_image = bias->GetCLImage(); cl_mem output_image = output->GetCLImage(); @@ -76,13 +77,13 @@ void ElementwiseAddKernel::Compute( int width = input->ImageWidth(); int height = input->ImageHeight(); size_t global_work_size[2] = {width, height}; - status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - }else{ + } else { DLOG << "error:bias dims is error"; } - } template class ElementwiseAddKernel; diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp index ae40e8ae6a..3b45b7c49b 100644 --- a/src/operators/kernel/cl/relu_kernel.cpp +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -20,23 +20,23 @@ namespace operators { template <> bool ReluKernel::Init(ReluParam* param) { -// this->cl_helper_.AddKernel("relu", "relu.cl"); + // this->cl_helper_.AddKernel("relu", "relu.cl"); return true; } template <> void ReluKernel::Compute(const ReluParam& param) { -// auto kernel = this->cl_helper_.KernelAt(0); -// const auto* input = param.InputX(); -// auto* output = param.Out(); -// auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); -// auto inputImage = input->GetCLImage(); -// auto outputImage = output->GetCLImage(); -// clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); -// clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); -// const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; -// clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, -// work_size, NULL, 0, NULL, NULL); + // auto kernel = this->cl_helper_.KernelAt(0); + // const auto* input = param.InputX(); + // auto* output = param.Out(); + // auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); + // auto inputImage = input->GetCLImage(); + // auto outputImage = output->GetCLImage(); + // clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + // clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); + // const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; + // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL, + // work_size, NULL, 0, NULL, NULL); } template class ReluKernel; diff --git a/src/operators/kernel/cl/softmax_kernel.cpp b/src/operators/kernel/cl/softmax_kernel.cpp index 304c716c6e..a8196cf376 100644 --- a/src/operators/kernel/cl/softmax_kernel.cpp +++ b/src/operators/kernel/cl/softmax_kernel.cpp @@ -38,7 +38,7 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { const auto &inputDim = input->dims(); int dims[4] = {1, 1, 1, 1}; for (int i = 0; i < inputDim.size(); i++) { - dims[4-inputDim.size()+i] = inputDim[i]; + dims[4 - inputDim.size() + i] = inputDim[i]; } clSetKernelArg(kernel, 2, sizeof(int), &dims); clSetKernelArg(kernel, 3, sizeof(int), &dims[1]); -- GitLab