From 3092e4e9e153d1ebd55c867d54e462a986bae447 Mon Sep 17 00:00:00 2001 From: xiebaiyuan Date: Thu, 26 Dec 2019 21:51:36 +0800 Subject: [PATCH] [mobile]Develop common deepwise & fix bug in element mul (#2687) * [mobile][opencl]common deepwise conv,test=mobile * [mobile][opencl]revert deepwise 3x3 for stable ,test = mobile * [mobile][opencl]format convkernel.inc.cl with clang-format ,test = mobile * [mobile][opencl] suite 1*X Y element_y ,test=mobile * [mobile][opencl] add whole print method for cl_image ,test=mobile --- mobile/src/framework/cl/cl_image.cpp | 31 + mobile/src/framework/cl/cl_image.h | 2 + mobile/src/framework/cl/cl_tool.h | 15 +- .../kernel/cl/cl-kernel-func/conv_func.cpp | 17 +- .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 3874 +++++++++-------- .../cl/cl_kernel/elementwise_mul_kernel.cl | 106 +- .../kernel/cl/conv_add_bn_relu_kernel.cpp | 11 + .../operators/kernel/cl/conv_add_kernel.cpp | 9 + .../kernel/cl/conv_add_relu_kernel.cpp | 9 + .../kernel/cl/conv_bn_relu_kernel.cpp | 9 + .../src/operators/kernel/cl/conv_kernel.cpp | 9 + .../operators/kernel/cl/conv_relu_kernel.cpp | 9 + .../kernel/cl/elementwise_mul_kernel.cpp | 57 +- mobile/src/operators/op_param.h | 1 + 14 files changed, 2315 insertions(+), 1844 deletions(-) mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl diff --git a/mobile/src/framework/cl/cl_image.cpp b/mobile/src/framework/cl/cl_image.cpp index 0d4cf87db0..1b8966742d 100644 --- a/mobile/src/framework/cl/cl_image.cpp +++ b/mobile/src/framework/cl/cl_image.cpp @@ -18,6 +18,37 @@ limitations under the License. */ namespace paddle_mobile { namespace framework { +void CLImage::PrintTensor(const CLImage &cl_image) const { + size_t width = cl_image.ImageDims()[0]; + size_t height = cl_image.ImageDims()[1]; + + half_t *image_data = new half_t[height * width * 4]; + cl_int err; + cl_mem image = cl_image.GetCLImage(); + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {width, height, 1}; + err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin, + region, 0, 0, image_data, 0, NULL, NULL); + + CL_CHECK_ERRORS(err); + + PADDLE_MOBILE_ENFORCE(cl_image.numel() != 0, + "cl_image numel should not be 0 "); + float *tensor_data = new float[cl_image.numel()]; + auto converter = cl_image.Converter(); + converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(), + cl_image.dims()); + int stride = cl_image.numel() / 20; + stride = stride > 0 ? stride : 1; + + for (int i = 0; i < cl_image.numel(); i++) { + printf("%f \n", tensor_data[i]); + } + + delete[](tensor_data); + delete[](image_data); +} + void CLImageToTensor(CLImage *cl_image, Tensor *tensor, cl_context context, cl_command_queue commandQueue, cl_kernel kernel) { tensor->mutable_data(); diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h index f41d0ed659..d3d48cda8b 100644 --- a/mobile/src/framework/cl/cl_image.h +++ b/mobile/src/framework/cl/cl_image.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include @@ -285,6 +286,7 @@ class CLImage { cl_event GetClEvent() const { return cl_event_.get(); } CLImageConverterBase *Converter() const { return image_converter_; } + void PrintTensor(const CLImage &cl_image) const; private: void InitCLImage(cl_context context, size_t width, size_t height, diff --git a/mobile/src/framework/cl/cl_tool.h b/mobile/src/framework/cl/cl_tool.h index 25d5bfc584..ccc97779ec 100644 --- a/mobile/src/framework/cl/cl_tool.h +++ b/mobile/src/framework/cl/cl_tool.h @@ -21,13 +21,14 @@ namespace framework { const char* opencl_error_to_str(cl_int error); -#define CL_CHECK_ERRORS(ERR) \ - if (ERR != CL_SUCCESS) { \ - printf( \ - "OpenCL error with code %s happened in file %s at line %d. " \ - "Exiting.\n", \ - paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ - __LINE__); \ +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "\033[1;31;40mOpenCL error with code %s happened in file %s at line " \ + "%d. " \ + "Exiting.\033[0m\n", \ + paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \ + __LINE__); \ } } // namespace framework diff --git a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp index 72d94ced5d..a4dfd8321e 100644 --- a/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp +++ b/mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp @@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, cl_int status; int index = 0; - if (param.Filter()->dims()[2] == 1 && param.Filter()->dims()[3] == 1) { + const int filter_height = param.Filter()->dims()[2]; + const int filter_width = param.Filter()->dims()[3]; + if (filter_height == 1 && filter_width == 1) { status = clSetKernelArg(kernel, index++, sizeof(int), &c_block); CL_CHECK_ERRORS(status); @@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &output_height); CL_CHECK_ERRORS(status); - if (param.Filter()->dims()[2] == 3 && param.Filter()->dims()[3] == 3) { + if (filter_height == 3 && filter_width == 3) { // normal conv if (param.Filter()->dims()[0] == param.Output()->dims()[1] && param.Filter()->dims()[1] == param.Input()->dims()[1]) { @@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper, status = clSetKernelArg(kernel, index++, sizeof(int), &group); CL_CHECK_ERRORS(status); } + } else if (filter_height != 3 && filter_width != 3) { + // not 3x3 + if (param.Filter()->dims()[1] == 1 && + param.Input()->dims()[1] == param.Output()->dims()[1]) { + // deepwise basic use in not 3x3 + status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height); + CL_CHECK_ERRORS(status); + } } status = clEnqueueNDRangeKernel( diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl old mode 100755 new mode 100644 index d3078e6a5c..bf31f32970 --- a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -24,980 +24,1101 @@ conv_add_bn_relu #include "cl_common.h" -__kernel void conv_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, - +__kernel void conv_3x3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height, - __private const int output_c, - __private const int filter_channel, - __private const int group) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height, + __private const int output_c, __private const int filter_channel, + __private const int group) { - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; -#endif - - half4 input[9]; - if (group == 1) { - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[1] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[2] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - - input[3] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[4] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[5] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - input[6] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[7] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - input[8] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - -/* - for (int j = 0; j < 9; ++j) { - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } -*/ - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + half4 output = 0.0f; +#endif + half4 input[9]; + if (group == 1) { + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[1] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + /* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, + pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, + pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, + pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, + pos_of_weight); + output.w += dot(input[j], weight_w); + } + */ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + } else { + for (int i = 0; i < 4; i++) { + int used_input_channel_num = + (out_c * 4 + i) / (output_c / group) * filter_channel; + for (int f_c = 0; f_c < filter_channel; ++f_c) { + int input_c = used_input_channel_num + f_c; + int input_block = input_c / 4; + int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, + in_pos_in_one_block.y); + input[0] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[1] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[2] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + input[3] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[4] = select( + read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[5] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + input[6] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[7] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + input[8] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + half tmp_out = 0; + for (int j = 0; j < 9; j++) { + int2 pos_of_weight; + pos_of_weight.x = (f_c / 4) * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; + half4 weight = read_imageh(filter, sampler, pos_of_weight); + int f_c_offset = f_c % 4; + half f_value; + if (f_c_offset == 0) { + f_value = weight.x; + } else if (f_c_offset == 1) { + f_value = weight.y; + } else if (f_c_offset == 2) { + f_value = weight.z; + } else if (f_c_offset == 3) { + f_value = weight.w; + } + int input_c_offset = input_c % 4; + half input_value; + if (input_c_offset == 0) { + input_value = input[j].x; + } else if (input_c_offset == 1) { + input_value = input[j].y; + } else if (input_c_offset == 2) { + input_value = input[j].z; + } else if (input_c_offset == 3) { + input_value = input[j].w; + } + tmp_out += f_value * input_value; } - } else { - for (int i = 0; i < 4; i++) { - int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel; - for (int f_c = 0; f_c < filter_channel; ++f_c) { - int input_c = used_input_channel_num + f_c; - int input_block = input_c / 4; - int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - input[1] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - input[2] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); - input[3] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[4] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[5] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - input[6] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - input[7] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - input[8] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); - - half tmp_out = 0; - for (int j = 0; j < 9; j++) { - int2 pos_of_weight; - pos_of_weight.x = (f_c / 4) * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3; - half4 weight = read_imageh(filter, sampler, pos_of_weight); - int f_c_offset = f_c % 4; - half f_value; - if (f_c_offset == 0) { - f_value = weight.x; - } else if (f_c_offset == 1) { - f_value = weight.y; - } else if (f_c_offset == 2) { - f_value = weight.z; - } else if (f_c_offset == 3) { - f_value = weight.w; - } - int input_c_offset = input_c % 4; - half input_value; - if (input_c_offset == 0) { - input_value = input[j].x; - } else if (input_c_offset == 1) { - input_value = input[j].y; - } else if (input_c_offset == 2) { - input_value = input[j].z; - } else if (input_c_offset == 3) { - input_value = input[j].w; - } - tmp_out += f_value * input_value; - } - - if (i == 0) { - output.x += tmp_out; - } else if (i == 1) { - output.y += tmp_out; - } else if (i == 2) { - output.z += tmp_out; - } else if (i == 3) { - output.w += tmp_out; - } - } + + if (i == 0) { + output.x += tmp_out; + } else if (i == 1) { + output.y += tmp_out; + } else if (i == 2) { + output.z += tmp_out; + } else if (i == 3) { + output.w += tmp_out; } + } } - + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } - // dilation == 1 -__kernel void conv_3x3spl(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, +// dilation == 1 +__kernel void conv_3x3spl( + __private const int item_ch, __private const int item_w, + __private const int item_h, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int pad, __private const int dilation, + __private const int in_ch, __private const int in_w, + __private const int in_h, __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; #ifdef BIASE_CH - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; #elif defined(BIASE_ELE) - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); - } + half4 output[5]; + output[0] = + read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } #else - half4 output[5] = {0.0f}; -#endif - - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - half4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * 3; - int filter_h_val1 = filter_h_val0 + 3; - int filter_h_val2 = filter_h_val1 + 3; - int filter_h_val3 = filter_h_val2 + 3; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * 3; - - for (int h = 0; h < 3; h++) { - - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, - (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); - - for (int w = 0; w < 3; w++) { - - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } + half4 output[5] = {0.0f}; +#endif + + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + half4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + (out_batch_id * in_h + in_h_id + h < 0 || + out_batch_id * in_h + in_h_id + h >= in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = + read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = + read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = + read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = + read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = + read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); } + } } + } #ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id1 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id2 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id3 < out_w) { - output[3] = mad(scale, output[3], biase); - } - if (out_w_id4 < out_w) { - output[4] = mad(scale, output[4], biase); - } + half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output[0] = mad(scale, output[0], biase); + if (out_w_id1 < out_w) { + output[1] = mad(scale, output[1], biase); + } + if (out_w_id2 < out_w) { + output[2] = mad(scale, output[2], biase); + } + if (out_w_id3 < out_w) { + output[3] = mad(scale, output[3], biase); + } + if (out_w_id4 < out_w) { + output[4] = mad(scale, output[4], biase); + } #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); -#endif - write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); - if (out_w_id1 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); - } - if (out_w_id2 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); - } - if (out_w_id3 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); - } - if (out_w_id4 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); - } + output[0] = activation(output[0]); + output[1] = activation(output[1]); + output[2] = activation(output[2]); + output[3] = activation(output[3]); + output[4] = activation(output[4]); +#endif + write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } } - - -__kernel void depth_conv_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv_3x3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const int batch_index = out_nh / output_height; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - const int out_nh_in_one_batch = out_nh % output_height; + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; - int2 stride_xy = (int2)(stride, stride); - int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; + half4 output = 0.0f; #endif - const int filter_width = 3; - const int filter_height = 3; - - int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - - int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); - - int filter_x = pos_in_filter_block.x ; - int filter_y = pos_in_filter_block.y ; - - half4 inputs[9]; - - inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); - - inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - /* - if (output_pos.x == 112 && output_pos.y == 0) { - half4 input1 = inputs[3]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 3 - %v4hlf \n", in); - printf(" --- %d ---\n", in_pos_in_one_block.x - 1); - } - */ - - - inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); - - inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); - - half4 filters[9]; - filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); - filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); - filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); - - for(int i = 0 ;i < 9 ; i++){ - output += inputs[i] * filters[i]; - } + const int filter_width = 3; + const int filter_height = 3; + + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + + half4 inputs[9]; + + inputs[0] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[1] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[2] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y - 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y - 1 >= input_height) + << 15)); + + inputs[3] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + inputs[4] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[5] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + inputs[6] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x - 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[7] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + inputs[8] = select( + read_imageh(input, sampler, + (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, + pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 || + in_pos_in_one_block.y + 1 < 0 || + in_pos_in_one_block.x + 1 >= input_width || + in_pos_in_one_block.y + 1 >= input_height) + << 15)); + + half4 filters[9]; + filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y)); + filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1)); + filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + + for (int i = 0; i < 9; i++) { + output += inputs[i] * filters[i]; + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif + /* + if (output_pos.x == 112 && output_pos.y == 0) { + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + */ - /* - - if (output_pos.x == 112 && output_pos.y == 0) { - - for (int i = 0; i < 9; ++i) { - half4 input1 = inputs[i]; - float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); - printf(" input4 %d - %v4hlf \n", i, in); - } - - float4 out = (float4)(output.x, output.y, output.z, output.w); - printf(" depth wise output output4 = %v4hlf \n", out); - printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); - printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); - printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); - printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); - } - - */ - - write_imageh(output_image, output_pos, output); - + write_imageh(output_image, output_pos, output); } - - -__kernel void depth_conv_3x3s1(__private const int ou_ch_blk, - __private const int ou_w_blk, - __private const int ou_nh, - __read_only image2d_t input, - __read_only image2d_t filter, +__kernel void depth_conv_3x3s1( + __private const int ou_ch_blk, __private const int ou_w_blk, + __private const int ou_nh, __read_only image2d_t input, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w,/* of one block */ - __private const int in_h, /* of one block */ - __private const int ou_w, - __private const int ou_h) { - - const int ou_ch_blk_id = get_global_id(0); - const int ou_w_blk_id = get_global_id(1); - const int ou_nh_id = get_global_id(2); - const int w_blk_size = 2; - - const int batch_id = ou_nh_id / ou_h; - int ou_col_id = ou_w_blk_id * w_blk_size; - int ou_row_id = ou_nh_id % ou_h; - int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - - // input pos in one block and on batch - int col_id = ou_col_id - pad; - int row_id = ou_row_id - pad; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - -#ifdef BIASE_CH - half4 output[2]; - output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0)); - output[1] = output[0]; -#elif defined(BIASE_ELE) - half4 output[2]; - output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id)); - if (ou_col_id + 1 < ou_w) { - output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id)); - } -#else - half4 output[2] = {0.0f}; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif + __write_only image2d_t output_image, __private const int stride, + __private const int pad, __private const int dilation, + __private const int in_ch, __private const int in_w, /* of one block */ + __private const int in_h, /* of one block */ + __private const int ou_w, __private const int ou_h) { - half4 inputs[12]; + const int ou_ch_blk_id = get_global_id(0); + const int ou_w_blk_id = get_global_id(1); + const int ou_nh_id = get_global_id(2); + const int w_blk_size = 2; - int filter_x = ou_ch_blk_id * 3; - int filter_y = 0; - half4 filters[9]; - filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); - filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); - filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); + const int batch_id = ou_nh_id / ou_h; + int ou_col_id = ou_w_blk_id * w_blk_size; + int ou_row_id = ou_nh_id % ou_h; + int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id); - int in_x = mad24(ou_ch_blk_id, in_w, col_id); - int in_y = mad24(batch_id, in_h, row_id); + // input pos in one block and on batch + int col_id = ou_col_id - pad; + int row_id = ou_row_id - pad; - int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); - int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); - inputs[0] = read_imageh(input, sampler, (int2)(x0, y0)); - int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); - inputs[1] = read_imageh(input, sampler, (int2)(x1, y0)); - int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); - inputs[2] = read_imageh(input, sampler, (int2)(x2, y0)); - int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); - inputs[3] = read_imageh(input, sampler, (int2)(x3, y0)); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - output[0] = mad(inputs[0], filters[0], output[0]); - output[1] = mad(inputs[1], filters[0], output[1]); +#ifdef BIASE_CH + half4 output[2]; + output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0)); + output[1] = output[0]; +#elif defined(BIASE_ELE) + half4 output[2]; + output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id)); + } +#else + half4 output[2] = {0.0f}; +#endif - output[0] = mad(inputs[1], filters[1], output[0]); - output[1] = mad(inputs[2], filters[1], output[1]); + half4 inputs[12]; - output[0] = mad(inputs[2], filters[2], output[0]); - output[1] = mad(inputs[3], filters[2], output[1]); + int filter_x = ou_ch_blk_id * 3; + int filter_y = 0; + half4 filters[9]; + filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y)); + filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y)); + filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y)); + int in_x = mad24(ou_ch_blk_id, in_w, col_id); + int in_y = mad24(batch_id, in_h, row_id); - filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); - filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); - filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h); + int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w); + inputs[0] = read_imageh(input, sampler, (int2)(x0, y0)); + int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w); + inputs[1] = read_imageh(input, sampler, (int2)(x1, y0)); + int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w); + inputs[2] = read_imageh(input, sampler, (int2)(x2, y0)); + int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w); + inputs[3] = read_imageh(input, sampler, (int2)(x3, y0)); + output[0] = mad(inputs[0], filters[0], output[0]); + output[1] = mad(inputs[1], filters[0], output[1]); - int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); - inputs[4] = read_imageh(input, sampler, (int2)(x0, y1)); - inputs[5] = read_imageh(input, sampler, (int2)(x1, y1)); - inputs[6] = read_imageh(input, sampler, (int2)(x2, y1)); - inputs[7] = read_imageh(input, sampler, (int2)(x3, y1)); + output[0] = mad(inputs[1], filters[1], output[0]); + output[1] = mad(inputs[2], filters[1], output[1]); + output[0] = mad(inputs[2], filters[2], output[0]); + output[1] = mad(inputs[3], filters[2], output[1]); - output[0] = mad(inputs[4], filters[3], output[0]); - output[1] = mad(inputs[5], filters[3], output[1]); + filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1)); + filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1)); + filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1)); - output[0] = mad(inputs[5], filters[4], output[0]); - output[1] = mad(inputs[6], filters[4], output[1]); + int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h); + inputs[4] = read_imageh(input, sampler, (int2)(x0, y1)); + inputs[5] = read_imageh(input, sampler, (int2)(x1, y1)); + inputs[6] = read_imageh(input, sampler, (int2)(x2, y1)); + inputs[7] = read_imageh(input, sampler, (int2)(x3, y1)); - output[0] = mad(inputs[6], filters[5], output[0]); - output[1] = mad(inputs[7], filters[5], output[1]); + output[0] = mad(inputs[4], filters[3], output[0]); + output[1] = mad(inputs[5], filters[3], output[1]); + output[0] = mad(inputs[5], filters[4], output[0]); + output[1] = mad(inputs[6], filters[4], output[1]); - filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); - filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); - filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + output[0] = mad(inputs[6], filters[5], output[0]); + output[1] = mad(inputs[7], filters[5], output[1]); - int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); - inputs[8] = read_imageh(input, sampler, (int2)(x0, y2)); - inputs[9] = read_imageh(input, sampler, (int2)(x1, y2)); - inputs[10] = read_imageh(input, sampler, (int2)(x2, y2)); - inputs[11] = read_imageh(input, sampler, (int2)(x3, y2)); + filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2)); + filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2)); + filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2)); + int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h); + inputs[8] = read_imageh(input, sampler, (int2)(x0, y2)); + inputs[9] = read_imageh(input, sampler, (int2)(x1, y2)); + inputs[10] = read_imageh(input, sampler, (int2)(x2, y2)); + inputs[11] = read_imageh(input, sampler, (int2)(x3, y2)); - output[0] = mad(inputs[8], filters[6], output[0]); - output[1] = mad(inputs[9], filters[6], output[1]); + output[0] = mad(inputs[8], filters[6], output[0]); + output[1] = mad(inputs[9], filters[6], output[1]); - output[0] = mad(inputs[9], filters[7], output[0]); - output[1] = mad(inputs[10], filters[7], output[1]); + output[0] = mad(inputs[9], filters[7], output[0]); + output[1] = mad(inputs[10], filters[7], output[1]); - output[0] = mad(inputs[10], filters[8], output[0]); - output[1] = mad(inputs[11], filters[8], output[1]); + output[0] = mad(inputs[10], filters[8], output[0]); + output[1] = mad(inputs[11], filters[8], output[1]); #ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0)); - output[0] = mad(scale, output[0], biase); - if (ou_col_id + 1 < ou_w) { - output[1] = mad(scale, output[1], biase); - } + half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0)); + output[0] = mad(scale, output[0], biase); + if (ou_col_id + 1 < ou_w) { + output[1] = mad(scale, output[1], biase); + } #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); + output[0] = activation(output[0]); + output[1] = activation(output[1]); #endif - write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]); - if (ou_col_id + 1 < ou_w) { - write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); - } - + write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]); + if (ou_col_id + 1 < ou_w) { + write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]); + } } -__kernel void conv_1x1(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv_1x1( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; + half4 output = 0.0f; #endif - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); -/* - output.x = dot(input, weight0); - output.y = dot(input, weight1); - output.z = dot(input, weight2); - output.w = dot(input, weight3); -*/ + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); - output = mad(input.x, weight0, output); - output = mad(input.y, weight1, output); - output = mad(input.z, weight2, output); - output = mad(input.w, weight3, output); + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + /* + output.x = dot(input, weight0); + output.y = dot(input, weight1); + output.z = dot(input, weight2); + output.w = dot(input, weight3); + */ - } + output = mad(input.x, weight0, output); + output = mad(input.y, weight1, output); + output = mad(input.z, weight2, output); + output = mad(input.w, weight3, output); + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU @@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple( __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c,__private const int input_c_origin, - __private const int dilation, + __private const int offset, __private const int input_c, + __private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w -) { + __private const int output_width, __private const int output_height, + __private const int old_w) { half zero = 0.0f; const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple( int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - int outpos_main = mul24(out_c , old_w); + int outpos_main = mul24(out_c, old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); @@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple( #ifdef BIASE_CH half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = output0; - half4 output2 = output0; - half4 output3 = output0; + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; #elif defined(BIASE_ELE) half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = output0; - half4 output2 = output0; - half4 output3 = output0; + half4 output1 = output0; + half4 output2 = output0; + half4 output3 = output0; #else half4 output0 = 0.0f; @@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple( for (int i = 0; i < input_c; ++i) { // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); half4 input0 = read_imageh(input_image, sampler, pos_in); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); @@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple( output0 = mad(input0.z, weight2, output0); output0 = mad(input0.w, weight3, output0); // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); output1 = mad(input1.x, weight0, output1); @@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple( output1 = mad(input1.w, weight3, output1); // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); output2 = mad(input2.x, weight0, output2); @@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple( output2 = mad(input2.w, weight3, output2); // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); output3 = mad(input3.x, weight0, output3); @@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple( #ifdef BATCH_NORM output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); #endif if (out_w0 < old_w) { write_imageh(output_image, output_pos0, output0); } - if (out_w1 < old_w){ + if (out_w1 < old_w) { write_imageh(output_image, output_pos1, output1); } - if (out_w2 < old_w){ + if (out_w2 < old_w) { write_imageh(output_image, output_pos2, output2); } - if (out_w3 < old_w){ + if (out_w3 < old_w) { write_imageh(output_image, output_pos3, output3); } } @@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped( __read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c,__private const int input_c_origin, - __private const int dilation, + __private const int offset, __private const int input_c, + __private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w - ) { + __private const int output_width, __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped( int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - int outpos_main = mul24(out_c , old_w); + int outpos_main = mul24(out_c, old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); @@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped( ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); #ifdef BIASE_CH - half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); - half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output0 = read_imageh(bias, sampler, output_pos0); - half4 output1 = read_imageh(bias, sampler, output_pos1); - half4 output2 = read_imageh(bias, sampler, output_pos2); - half4 output3 = read_imageh(bias, sampler, output_pos3); + half4 output0 = read_imageh(bias, sampler, output_pos0); + half4 output1 = read_imageh(bias, sampler, output_pos1); + half4 output2 = read_imageh(bias, sampler, output_pos2); + half4 output3 = read_imageh(bias, sampler, output_pos3); #else half4 output0 = 0.0f; @@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped( int burndary_index = input_c * 4 - input_c_origin; for (int i = 0; i < input_c; ++i) { // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); half4 input0 = read_imageh(input_image, sampler, pos_in); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); @@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped( half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); - if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){ - if (burndary_index==0){ + if ((max_w_bound - pos_in.x - 1) < input_width && + (max_w_bound - pos_in.x - 1) >= 0) { + if (burndary_index == 0) { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); output0 = mad(input0.w, weight3, output0); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); output0 = mad(0.0f, weight3, output0); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(0.0f, weight2, output0); output0 = mad(0.0f, weight3, output0); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output0 = mad(input0.x, weight0, output0); output0 = mad(0.0f, weight1, output0); output0 = mad(0.0f, weight2, output0); output0 = mad(0.0f, weight3, output0); } - }else { + } else { output0 = mad(input0.x, weight0, output0); output0 = mad(input0.y, weight1, output0); output0 = mad(input0.z, weight2, output0); @@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped( } // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); - if (abs(max_w_bound - pos_in.x) < input_width){ - if (burndary_index==0){ + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); output1 = mad(input1.w, weight3, output1); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); output1 = mad(0.0f, weight3, output1); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(0.0f, weight2, output1); output1 = mad(0.0f, weight3, output1); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output1 = mad(input1.x, weight0, output1); output1 = mad(0.0f, weight1, output1); output1 = mad(0.0f, weight2, output1); output1 = mad(0.0f, weight3, output1); } - }else { + } else { output1 = mad(input1.x, weight0, output1); output1 = mad(input1.y, weight1, output1); output1 = mad(input1.z, weight2, output1); @@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped( } // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); - if (abs(max_w_bound - pos_in.x) < input_width){ - if (burndary_index==0){ + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); output2 = mad(input2.w, weight3, output2); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); output2 = mad(0.0f, weight3, output2); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(0.0f, weight2, output2); output2 = mad(0.0f, weight3, output2); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output2 = mad(input2.x, weight0, output2); output2 = mad(0.0f, weight1, output2); output2 = mad(0.0f, weight2, output2); output2 = mad(0.0f, weight3, output2); } - }else { + } else { output2 = mad(input2.x, weight0, output2); output2 = mad(input2.y, weight1, output2); output2 = mad(input2.z, weight2, output2); @@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped( } // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); - if (abs(max_w_bound - pos_in.x) < input_width){ - if (burndary_index==0){ + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(input3.z, weight2, output3); output3 = mad(input3.w, weight3, output3); - } else if (burndary_index==1){ + } else if (burndary_index == 1) { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(input3.z, weight2, output3); output3 = mad(0.0f, weight3, output3); - } else if (burndary_index==2){ + } else if (burndary_index == 2) { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(0.0f, weight2, output3); output3 = mad(0.0f, weight3, output3); - } else if (burndary_index==3){ + } else if (burndary_index == 3) { output3 = mad(input3.x, weight0, output3); output3 = mad(0.0f, weight1, output3); output3 = mad(0.0f, weight2, output3); output3 = mad(0.0f, weight3, output3); } - }else { + } else { output3 = mad(input3.x, weight0, output3); output3 = mad(input3.y, weight1, output3); output3 = mad(input3.z, weight2, output3); @@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped( } #ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output0 = activation(output0); - output1 = activation(output1); - output2 = activation(output2); - output3 = activation(output3); + output0 = activation(output0); + output1 = activation(output1); + output2 = activation(output2); + output3 = activation(output3); #endif if (out_w0 < old_w) { write_imageh(output_image, output_pos0, output0); } - if (out_w1 < old_w){ + if (out_w1 < old_w) { write_imageh(output_image, output_pos1, output1); } - if (out_w2 < old_w){ + if (out_w2 < old_w) { write_imageh(output_image, output_pos2, output2); } - if (out_w3 < old_w){ + if (out_w3 < old_w) { write_imageh(output_image, output_pos3, output3); } } -__kernel void conv_7x7(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - +__kernel void conv_7x7( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c + 2; - const int filter_n3 = 4 * out_c + 3; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; -#endif - - half4 input; - half4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for(int j = 0; j < 7; j++){ - for(int k = 0; k < 7; k++){ - input = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; - - filter[0] = read_imageh(filter_image, sampler, filter_pos0); - filter[1] = read_imageh(filter_image, sampler, filter_pos1); - filter[2] = read_imageh(filter_image, sampler, filter_pos2); - filter[3] = read_imageh(filter_image, sampler, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - } - } + half4 output = 0.0f; +#endif + + half4 input; + half4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for (int j = 0; j < 7; j++) { + for (int k = 0; k < 7; k++) { + input = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 3) * dilation, + pos_in.y + (k - 3) * dilation)), + (half4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 3) * dilation < 0 || + in_pos_in_one_block.y + (k - 3) * dilation < 0 || + in_pos_in_one_block.x + (j - 3) * dilation >= input_width || + in_pos_in_one_block.y + (k - 3) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 7 + filter_w; + filter_pos0.y = filter_n0 * 7 + filter_h; + + filter_pos1.x = filter_c * 7 + filter_w; + filter_pos1.y = filter_n1 * 7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter[0] = read_imageh(filter_image, sampler, filter_pos0); + filter[1] = read_imageh(filter_image, sampler, filter_pos1); + filter[2] = read_imageh(filter_image, sampler, filter_pos2); + filter[3] = read_imageh(filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } } + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } -__kernel void conv_7x7Pt1x2(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - +__kernel void conv_7x7Pt1x2( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w1 = get_global_id(1); - const int out_nh = get_global_id(2); - - if (out_c >= global_size_dim0 || - out_w1 >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const int out_w = out_w1 * 2; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w1 = get_global_id(1); + const int out_nh = get_global_id(2); - int2 output_pos = (int2)(out_c * output_width + out_w, out_nh); + if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const int out_w = out_w1 * 2; - const int filter_n0 = 4 * out_c + 0; - const int filter_n1 = 4 * out_c + 1; - const int filter_n2 = 4 * out_c + 2; - const int filter_n3 = 4 * out_c + 3; + int2 output_pos = (int2)(out_c * output_width + out_w, out_nh); - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - half4 output0 = 0.0f; - half4 output1 = 0.0f; + half4 output0 = 0.0f; + half4 output1 = 0.0f; #ifdef BIASE_CH - output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); - output1 = output0; + output0 = read_imageh(bias, sampler, (int2)(out_c, 0)); + output1 = output0; #elif defined(BIASE_ELE) - output0 = read_imageh(bias, sampler, output_pos); - output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y)); + output0 = read_imageh(bias, sampler, output_pos); + output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y)); #else - output0 = 0.0f; - output1 = 0.0f; -#endif - - half4 input[8]; - half4 filter0[4]; - half4 filter1[4]; - half4 filter2[4]; - half4 filter3[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for(int k = 0; k < 7; k++){ - for (int j = 0; j < 8; j++) { - input[j] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); - - int filter_h = k; - int filter_w = j; - int filter_c = i; - - if (j < 7) { - filter_pos0.x = filter_c * 7 + filter_w; - filter_pos0.y = filter_n0 * 7 + filter_h; - - filter_pos1.x = filter_c * 7 + filter_w; - filter_pos1.y = filter_n1 * 7 + filter_h; - - filter_pos2.x = filter_c * 7 + filter_w; - filter_pos2.y = filter_n2 * 7 + filter_h; - - filter_pos3.x = filter_c * 7 + filter_w; - filter_pos3.y = filter_n3 * 7 + filter_h; - - filter0[0] = read_imageh(filter_image, sampler, filter_pos0); - filter0[1] = read_imageh(filter_image, sampler, filter_pos1); - filter0[2] = read_imageh(filter_image, sampler, filter_pos2); - filter0[3] = read_imageh(filter_image, sampler, filter_pos3); - - output0.x += dot(input[j], filter0[0]); - output0.y += dot(input[j], filter0[1]); - output0.z += dot(input[j], filter0[2]); - output0.w += dot(input[j], filter0[3]); - } - - if (j > 0) { - output1.x += dot(input[j], filter1[0]); - output1.y += dot(input[j], filter1[1]); - output1.z += dot(input[j], filter1[2]); - output1.w += dot(input[j], filter1[3]); - } - - filter1[0] = filter0[0]; - filter1[1] = filter0[1]; - filter1[2] = filter0[2]; - filter1[3] = filter0[3]; - } + output0 = 0.0f; + output1 = 0.0f; +#endif + + half4 input[8]; + half4 filter0[4]; + half4 filter1[4]; + half4 filter2[4]; + half4 filter3[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for (int k = 0; k < 7; k++) { + for (int j = 0; j < 8; j++) { + input[j] = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 3) * dilation, + pos_in.y + (k - 3) * dilation)), + (half4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 3) * dilation < 0 || + in_pos_in_one_block.y + (k - 3) * dilation < 0 || + in_pos_in_one_block.x + (j - 3) * dilation >= input_width || + in_pos_in_one_block.y + (k - 3) * dilation >= input_height) + << 15)); + + int filter_h = k; + int filter_w = j; + int filter_c = i; + + if (j < 7) { + filter_pos0.x = filter_c * 7 + filter_w; + filter_pos0.y = filter_n0 * 7 + filter_h; + + filter_pos1.x = filter_c * 7 + filter_w; + filter_pos1.y = filter_n1 * 7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter0[0] = read_imageh(filter_image, sampler, filter_pos0); + filter0[1] = read_imageh(filter_image, sampler, filter_pos1); + filter0[2] = read_imageh(filter_image, sampler, filter_pos2); + filter0[3] = read_imageh(filter_image, sampler, filter_pos3); + + output0.x += dot(input[j], filter0[0]); + output0.y += dot(input[j], filter0[1]); + output0.z += dot(input[j], filter0[2]); + output0.w += dot(input[j], filter0[3]); + } + + if (j > 0) { + output1.x += dot(input[j], filter1[0]); + output1.y += dot(input[j], filter1[1]); + output1.z += dot(input[j], filter1[2]); + output1.w += dot(input[j], filter1[3]); } - } + + filter1[0] = filter0[0]; + filter1[1] = filter0[1]; + filter1[2] = filter0[2]; + filter1[3] = filter0[3]; + } + } + } #ifdef BATCH_NORM - half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0)); - half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output0 = output0 * s + b; - output1 = output1 * s + b; + half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0)); + half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output0 = output0 * s + b; + output1 = output1 * s + b; #endif #ifdef RELU - output0 = activation(output0); - output1 = activation(output1); + output0 = activation(output0); + output1 = activation(output1); #endif - write_imageh(output_image, output_pos, output0); - if ((output_pos.x + 1) % output_width != 0) { - write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1); - } + write_imageh(output_image, output_pos, output0); + if ((output_pos.x + 1) % output_width != 0) { + write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1); + } } // dilation == 1 -__kernel void conv_7x7spl(__private const int item_ch, - __private const int item_w, - __private const int item_h, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, +__kernel void conv_7x7spl( + __private const int item_ch, __private const int item_w, + __private const int item_h, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM -__read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int pad, - __private const int dilation, - __private const int in_ch, - __private const int in_w, - __private const int in_h, - __private const int out_w, - __private const int out_h) { - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - // filter - const int filter_w = 7; - const int filter_h = 7; - - // item_id - const int item_ch_id = get_global_id(0); - const int item_w_id = get_global_id(1); - const int item_h_id = get_global_id(2); - - // out_width_id_per_blk and out_batch_id - int out_batch_id = item_h_id / in_h; - int out_w_base_id = item_ch_id * out_w; - int out_w_id0 = item_w_id; - int out_w_id1 = out_w_id0 + item_w; - int out_w_id2 = out_w_id1 + item_w; - int out_w_id3 = out_w_id2 + item_w; - int out_w_id4 = out_w_id3 + item_w; - - // in_width_id_per_blk and in_height_id_per_batch - int in_h_id = (item_h_id % out_h) * stride - pad; - int in_w_id0 = item_w_id * stride - pad; - int in_w_id1 = in_w_id0 + item_w * stride; - int in_w_id2 = in_w_id1 + item_w * stride; - int in_w_id3 = in_w_id2 + item_w * stride; - int in_w_id4 = in_w_id3 + item_w * stride; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int pad, __private const int dilation, + __private const int in_ch, __private const int in_w, + __private const int in_h, __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; #ifdef BIASE_CH - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); - output[1] = output[0]; - output[2] = output[0]; - output[3] = output[0]; - output[4] = output[0]; + half4 output[5]; + output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; #elif defined(BIASE_ELE) - half4 output[5]; - output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); - if (out_w_id1 < out_w) { - output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id)); - } - if (out_w_id2 < out_w) { - output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id)); - } - if (out_w_id3 < out_w) { - output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id)); - } - if (out_w_id4 < out_w) { - output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id)); - } + half4 output[5]; + output[0] = + read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = read_imageh(bias, sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } #else - half4 output[5] = {0.0f}; -#endif - - half4 filter[4] = {0.0f}; - half4 filter_trans[4] = {0.0f}; - half4 input[5] = {0.0f}; - - int filter_h_val0 = item_ch_id * 4 * filter_h; - int filter_h_val1 = filter_h_val0 + filter_h; - int filter_h_val2 = filter_h_val1 + filter_h; - int filter_h_val3 = filter_h_val2 + filter_h; - - for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { - int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; - - const int in_w_base_id = mul24(ch, in_w); - - int filter_w_val = ch * filter_w; - - for (int h = 0; h < filter_h; h++) { - - int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, - (out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h)); - - for (int w = 0; w < filter_w; w++) { - - int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, - (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); - int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, - (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); - int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, - (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); - int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, - (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); - int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, - (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); - - filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0 - filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1 - filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2 - filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3 - - filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3 - filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3 - filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3 - filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3 - - input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); - input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); - input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); - input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); - input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); - - output[0] = mad(input[0].x, filter_trans[0], output[0]); - output[1] = mad(input[1].x, filter_trans[0], output[1]); - output[2] = mad(input[2].x, filter_trans[0], output[2]); - output[3] = mad(input[3].x, filter_trans[0], output[3]); - output[4] = mad(input[4].x, filter_trans[0], output[4]); - - if (ch_surplus < 3) { - output[0] = mad(input[0].y, filter_trans[1], output[0]); - output[1] = mad(input[1].y, filter_trans[1], output[1]); - output[2] = mad(input[2].y, filter_trans[1], output[2]); - output[3] = mad(input[3].y, filter_trans[1], output[3]); - output[4] = mad(input[4].y, filter_trans[1], output[4]); - } - if (ch_surplus < 2) { - output[0] = mad(input[0].z, filter_trans[2], output[0]); - output[1] = mad(input[1].z, filter_trans[2], output[1]); - output[2] = mad(input[2].z, filter_trans[2], output[2]); - output[3] = mad(input[3].z, filter_trans[2], output[3]); - output[4] = mad(input[4].z, filter_trans[2], output[4]); - } - if (ch_surplus < 1) { - output[0] = mad(input[0].w, filter_trans[3], output[0]); - output[1] = mad(input[1].w, filter_trans[3], output[1]); - output[2] = mad(input[2].w, filter_trans[3], output[2]); - output[3] = mad(input[3].w, filter_trans[3], output[3]); - output[4] = mad(input[4].w, filter_trans[3], output[4]); - } - } + half4 output[5] = {0.0f}; +#endif + + half4 filter[4] = {0.0f}; + half4 filter_trans[4] = {0.0f}; + half4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1, + (out_batch_id * in_h + in_h_id + h < 0 || + out_batch_id * in_h + in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = read_imageh( + filter_image, sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = + read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = + read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = + read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = + read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = + read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } } + } #ifdef BATCH_NORM - half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); - half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); - output[0] = mad(scale, output[0], biase); - if (out_w_id1 < out_w) { - output[1] = mad(scale, output[1], biase); - } - if (out_w_id2 < out_w) { - output[2] = mad(scale, output[2], biase); - } - if (out_w_id3 < out_w) { - output[3] = mad(scale, output[3], biase); - } - if (out_w_id4 < out_w) { - output[4] = mad(scale, output[4], biase); - } + half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0)); + half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0)); + output[0] = mad(scale, output[0], biase); + if (out_w_id1 < out_w) { + output[1] = mad(scale, output[1], biase); + } + if (out_w_id2 < out_w) { + output[2] = mad(scale, output[2], biase); + } + if (out_w_id3 < out_w) { + output[3] = mad(scale, output[3], biase); + } + if (out_w_id4 < out_w) { + output[4] = mad(scale, output[4], biase); + } #endif #ifdef RELU - output[0] = activation(output[0]); - output[1] = activation(output[1]); - output[2] = activation(output[2]); - output[3] = activation(output[3]); - output[4] = activation(output[4]); -#endif - write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]); - if (out_w_id1 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]); - } - if (out_w_id2 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]); - } - if (out_w_id3 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]); - } - if (out_w_id4 < out_w) { - write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); - } + output[0] = activation(output[0]); + output[1] = activation(output[1]); + output[2] = activation(output[2]); + output[3] = activation(output[3]); + output[4] = activation(output[4]); +#endif + write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } } -__kernel void conv_5x5(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter_image, - +__kernel void conv_5x5( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter_image, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - const filter_n0 = 4 * out_c + 0; - const filter_n1 = 4 * out_c + 1; - const filter_n2 = 4 * out_c + 2; - const filter_n3 = 4 * out_c + 3; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const filter_n0 = 4 * out_c + 0; + const filter_n1 = 4 * out_c + 1; + const filter_n2 = 4 * out_c + 2; + const filter_n3 = 4 * out_c + 3; - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; #ifdef BIASE_CH - half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - half4 output = read_imageh(bias, sampler, output_pos); + half4 output = read_imageh(bias, sampler, output_pos); #else - half4 output = 0.0f; -#endif - - half4 input; - half4 filter[4]; - int2 filter_pos0; - int2 filter_pos1; - int2 filter_pos2; - int2 filter_pos3; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - for(int j = 0; j < 5; j++){ - for(int k = 0; k < 5; k++){ - input = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + (j - 2) * dilation, pos_in.y + (k - 2) * dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15)); - int filter_h = k; - int filter_w = j; - int filter_c = i; - - filter_pos0.x = filter_c * 5 + filter_w; - filter_pos0.y = filter_n0 * 5 + filter_h; - - filter_pos1.x = filter_c * 5 + filter_w; - filter_pos1.y = filter_n1 * 5 + filter_h; - - filter_pos2.x = filter_c * 5 + filter_w; - filter_pos2.y = filter_n2 * 5 + filter_h; - - filter_pos3.x = filter_c * 5 + filter_w; - filter_pos3.y = filter_n3 * 5 + filter_h; - - filter[0] = read_imageh(filter_image, sampler, filter_pos0); - filter[1] = read_imageh(filter_image, sampler, filter_pos1); - filter[2] = read_imageh(filter_image, sampler, filter_pos2); - filter[3] = read_imageh(filter_image, sampler, filter_pos3); - - output.x += dot(input, filter[0]); - output.y += dot(input, filter[1]); - output.z += dot(input, filter[2]); - output.w += dot(input, filter[3]); - } - } + half4 output = 0.0f; +#endif + + half4 input; + half4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for (int j = 0; j < 5; j++) { + for (int k = 0; k < 5; k++) { + input = select( + read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 2) * dilation, + pos_in.y + (k - 2) * dilation)), + (half4)(0.0f), + (ushort4)( + (in_pos_in_one_block.x + (j - 2) * dilation < 0 || + in_pos_in_one_block.y + (k - 2) * dilation < 0 || + in_pos_in_one_block.x + (j - 2) * dilation >= input_width || + in_pos_in_one_block.y + (k - 2) * dilation >= input_height) + << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 5 + filter_w; + filter_pos0.y = filter_n0 * 5 + filter_h; + + filter_pos1.x = filter_c * 5 + filter_w; + filter_pos1.y = filter_n1 * 5 + filter_h; + + filter_pos2.x = filter_c * 5 + filter_w; + filter_pos2.y = filter_n2 * 5 + filter_h; + + filter_pos3.x = filter_c * 5 + filter_w; + filter_pos3.y = filter_n3 * 5 + filter_h; + + filter[0] = read_imageh(filter_image, sampler, filter_pos0); + filter[1] = read_imageh(filter_image, sampler, filter_pos1); + filter[2] = read_imageh(filter_image, sampler, filter_pos2); + filter[3] = read_imageh(filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } } + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } -__kernel void convBNAdd_3x3(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, - +__kernel void convBNAdd_3x3( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - - if (out_c >= global_size_dim0 || - out_w >= global_size_dim1 || - out_nh >= global_size_dim2) { - return; - } - - - int2 stride_xy; - stride_xy.x = stride; - stride_xy.y = stride; - - int2 ouput_pos_in_one_block; - ouput_pos_in_one_block.x = out_w; - ouput_pos_in_one_block.y = out_nh; - - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - int2 in_pos_in_one_block; - in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; - in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { - half4 output = (half4)0.0f; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); - half4 input[9]; + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - input[0] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + if (out_c >= global_size_dim0 || out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } - input[1] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; - input[2] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y - dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; - input[3] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - input[4] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; - input[5] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + half4 output = (half4)0.0f; - input[6] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x - dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + half4 input[9]; - input[7] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + input[0] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[1] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[2] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y - dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y - dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y - dilation >= input_height) + << 15)); + + input[3] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[4] = select( + read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[5] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y >= input_height) + << 15)); + + input[6] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x - dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x - dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[7] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); + + input[8] = + select(read_imageh(input_image, sampler, + (int2)(pos_in.x + dilation, pos_in.y + dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + dilation < 0 || + in_pos_in_one_block.y + dilation < 0 || + in_pos_in_one_block.x + dilation >= input_width || + in_pos_in_one_block.y + dilation >= input_height) + << 15)); - input[8] = select(read_imageh(input_image, sampler, - (int2)(pos_in.x + dilation, pos_in.y + dilation)), - (half4)(0.0f), - (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); + /* + for (int j = 0; j < 9; ++j) { + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); -/* - for (int j = 0; j < 9; ++j) { - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - } -*/ - int j = 0; - int2 pos_of_weight; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - half4 weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - half4 weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - half4 weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - half4 weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 1; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 2; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 3; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 4; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 5; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 6; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 7; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); - - j = 8; - pos_of_weight.x = i * 3 + j % 3; - pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; - weight_x = read_imageh(filter, sampler, pos_of_weight); - output.x += dot(input[j], weight_x); - - pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; - weight_y = read_imageh(filter, sampler, pos_of_weight); - output.y += dot(input[j], weight_y); - - pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; - weight_z = read_imageh(filter, sampler, pos_of_weight); - output.z += dot(input[j], weight_z); - - pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; - weight_w = read_imageh(filter, sampler, pos_of_weight); - output.w += dot(input[j], weight_w); + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); - } + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } + */ + int j = 0; + int2 pos_of_weight; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + half4 weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + half4 weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + half4 weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + half4 weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 1; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 2; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 3; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 4; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 5; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 6; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 7; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + + j = 8; + pos_of_weight.x = i * 3 + j % 3; + pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3; + weight_x = read_imageh(filter, sampler, pos_of_weight); + output.x += dot(input[j], weight_x); + + pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3; + weight_y = read_imageh(filter, sampler, pos_of_weight); + output.y += dot(input[j], weight_y); + + pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3; + weight_z = read_imageh(filter, sampler, pos_of_weight); + output.z += dot(input[j], weight_z); + + pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3; + weight_w = read_imageh(filter, sampler, pos_of_weight); + output.w += dot(input[j], weight_w); + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef BIASE_CH - output += read_imageh(bias, sampler, (int2)(out_c, 0)); + output += read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - output += read_imageh(bias, sampler, output_pos); + output += read_imageh(bias, sampler, output_pos); #endif #ifdef RELU - output = activation(output); + output = activation(output); #endif - write_imageh(output_image, output_pos, output); + write_imageh(output_image, output_pos, output); } -__kernel void convBNAdd_1x1(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void convBNAdd_1x1( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int dilation, - __private const int input_width,/* of one block */ - __private const int input_height,/* of one block */ - __private const int output_width, - __private const int output_height) { + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh); - int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); - + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); half4 output = 0.0f; - for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - half4 input = read_imageh(input_image, sampler, pos_in); - - half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); - half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); - half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); - half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); -/* - output.x = dot(input, weight0); - output.y = dot(input, weight1); - output.z = dot(input, weight2); - output.w = dot(input, weight3); -*/ + for (int i = 0; i < input_c; ++i) { + int2 pos_in = + (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + half4 input = read_imageh(input_image, sampler, pos_in); - output = mad(input.x, weight0, output); - output = mad(input.y, weight1, output); - output = mad(input.z, weight2, output); - output = mad(input.w, weight3, output); + half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); + half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1)); + half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2)); + half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3)); + /* + output.x = dot(input, weight0); + output.y = dot(input, weight1); + output.z = dot(input, weight2); + output.w = dot(input, weight3); + */ - } + output = mad(input.x, weight0, output); + output = mad(input.y, weight1, output); + output = mad(input.z, weight2, output); + output = mad(input.w, weight3, output); + } #ifdef BATCH_NORM - output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef BIASE_CH - output += read_imageh(bias, sampler, (int2)(out_c, 0)); + output += read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - output += read_imageh(bias, sampler, output_pos); + output += read_imageh(bias, sampler, output_pos); #endif #ifdef RELU @@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0, } __kernel void convBNAdd_1x1_spl( - __private const int global_size_dim0, __private const int global_size_dim1, - __private const int global_size_dim2, __read_only image2d_t input_image, - __read_only image2d_t filter, + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, __read_only image2d_t new_biase, -#endif - __write_only image2d_t output_image, __private const int stride, - __private const int offset, __private const int input_c, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w -) { + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl( int out_w2 = out_w + global_size_dim1 * 2; int out_w3 = out_w + global_size_dim1 * 3; - int outpos_main = mul24(out_c , old_w); + int outpos_main = mul24(out_c, old_w); int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); int2 in_pos_in_one_block0 = - ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); int2 in_pos_in_one_block1 = - ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); int2 in_pos_in_one_block2 = - ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); int2 in_pos_in_one_block3 = - ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); - + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); half4 output0 = 0.0f; half4 output1 = 0.0f; @@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl( for (int i = 0; i < input_c; ++i) { // ------------0--------------- - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); half4 input0 = read_imageh(input_image, sampler, pos_in); half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0)); @@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl( output0 = mad(input0.w, weight3, output0); // -------------1-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); half4 input1 = read_imageh(input_image, sampler, pos_in); // // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + @@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl( output1 = mad(input1.w, weight3, output1); // -------------2-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); half4 input2 = read_imageh(input_image, sampler, pos_in); // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + @@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl( output2 = mad(input2.w, weight3, output2); // -------------3-------------- - pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); half4 input3 = read_imageh(input_image, sampler, pos_in); // half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + @@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl( } #ifdef BATCH_NORM - output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); - output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + - read_imageh(new_biase, sampler, (int2)(out_c, 0)); + output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif #ifdef BIASE_CH - output0 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output1 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output2 += read_imageh(bias, sampler, (int2)(out_c, 0)); - output3 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output0 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output1 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output2 += read_imageh(bias, sampler, (int2)(out_c, 0)); + output3 += read_imageh(bias, sampler, (int2)(out_c, 0)); #elif defined(BIASE_ELE) - output0 += read_imageh(bias, sampler, output_pos0); - output1 += read_imageh(bias, sampler, output_pos1); - output2 += read_imageh(bias, sampler, output_pos2); - output3 += read_imageh(bias, sampler, output_pos3); + output0 += read_imageh(bias, sampler, output_pos0); + output1 += read_imageh(bias, sampler, output_pos1); + output2 += read_imageh(bias, sampler, output_pos2); + output3 += read_imageh(bias, sampler, output_pos3); #endif #ifdef RELU @@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl( write_imageh(output_image, output_pos0, output0); } - if (out_w1 < old_w){ + if (out_w1 < old_w) { write_imageh(output_image, output_pos1, output1); } - if (out_w2 < old_w){ + if (out_w2 < old_w) { write_imageh(output_image, output_pos2, output2); } - if (out_w3 < old_w){ + if (out_w3 < old_w) { write_imageh(output_image, output_pos3, output3); } } +__kernel void depth_conv( + __private const int global_size_dim0, __private const int global_size_dim1, + __private const int global_size_dim2, __read_only image2d_t input, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, __private const int stride, + __private const int offset, __private const int input_c, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, __private const int output_height, + __private const int filter_width, __private const int filter_height) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const int batch_index = out_nh / output_height; + const int out_nh_in_one_batch = out_nh % output_height; + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + int2 in_pos_in_one_block = + ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); +#ifdef BIASE_CH + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#elif defined(BIASE_ELE) + half4 output = read_imageh(bias, sampler, output_pos); +#else + half4 output = 0.0f; +#endif + int2 pos_in_input_block = + (int2)(out_c * input_width, batch_index * input_height); + int2 pos_in_filter_block = + (int2)(out_c * filter_width, batch_index * filter_height); + int filter_x = pos_in_filter_block.x; + int filter_y = pos_in_filter_block.y; + int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x; + int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y; + int2 align = {filter_width / 2, filter_height / 2}; + /* if (output_pos.x == 0 && output_pos.y == 0){ + printf("align.x=%d align.y=%d \n ",align.x,align.y); + printf("stride=%d \n ",stride); + }*/ + for (int fy = 0; fy < filter_height; ++fy) { + for (int fx = 0; fx < filter_width; ++fx) { + int x_off = fx - align.x; + int y_off = fy - align.y; + /* if (output_pos.x == 0 && output_pos.y == 0){ + printf("fx=%d fy=%d \n ",fx,fy); + printf("x_off=%d y_off=%d \n ",x_off,y_off); + }*/ + half4 in = select( + read_imageh(input, sampler, + (int2)(input_x_base + x_off, input_y_base + y_off)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + x_off < 0 || + in_pos_in_one_block.y + y_off < 0 || + in_pos_in_one_block.x + x_off >= input_width || + in_pos_in_one_block.y + y_off >= input_height) + << 15)); + half4 f = + read_imageh(filter, sampler, (int2)(filter_x + fx, filter_y + fy)); + output += in * f; + /*if (output_pos.x ==111 && output_pos.y == 0){ + printf("in={ %f , %f , %f , %f } \n + ",convert_float(in.x),convert_float(in.y),convert_float(in.z),convert_float(in.w)); + printf("filter={ %f , %f , %f , %f } \n + ",convert_float(f.x),convert_float(f.y),convert_float(f.z),convert_float(f.w)); + printf("output={ %f , %f , %f , %f } \n + ",convert_float(output.x),convert_float(output.y),convert_float(output.z),convert_float(output.w)); + }*/ + } + } +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif - - +#ifdef RELU + output = activation(output); +#endif + write_imageh(output_image, output_pos, output); +} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl index b975eb4056..4895c07d20 100644 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl @@ -13,33 +13,101 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) { - int x = get_global_id(0); - int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - int2 coords; - coords.x = x; - coords.y = y; - half4 in = read_imageh(input, sampler, coords); - half4 biase = read_imageh(bias, sampler, coords); - half4 output = in * biase; - write_imageh(outputImage,coords,output); - } - - -__kernel void channel_mul(__global image2d_t input, __global image2d_t bias,__write_only -image2d_t outputImage, int w) { +__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords); + half4 output = in * biase; + write_imageh(outputImage, coords, output); +} + +__kernel void channel_mul(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 coords; coords.x = x; coords.y = y; int2 coords_bias; - coords_bias.x = x/w; + coords_bias.x = x / w; coords_bias.y = 0; half4 in = read_imageh(input, sampler, coords); half4 biase = read_imageh(bias, sampler, coords_bias); half4 output = in * biase; - write_imageh(outputImage,coords,output); + write_imageh(outputImage, coords, output); } + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + + /* if (x == 0 && y == 0) { + half4 b = (half4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = read_imageh(bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + + half4 biase0 = read_imageh(bias, sampler, coords_bias0); + half4 biase1 = read_imageh(bias, sampler, coords_bias1); + half4 biase2 = read_imageh(bias, sampler, coords_bias2); + half4 biase3 = read_imageh(bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + half4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + half4 in = read_imageh(input, sampler, coords); + half4 output = mad(in, biase, 0); + write_imageh(outputImage, coords, output); +} \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index a3a469dc86..1772cd275b 100644 --- a/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -174,6 +174,16 @@ bool ConvAddBNReluKernel::Init( build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + // other depthwise not with filter 3x3 + DLOG << "depth_conv basic "; + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); + } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -214,6 +224,7 @@ void ConvAddBNReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias(), param.NewScale(), param.NewBias()); break; diff --git a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp index a0e890a70b..94ffc001b4 100644 --- a/mobile/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_kernel.cpp @@ -71,6 +71,14 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); + } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -124,6 +132,7 @@ void ConvAddKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, false, param.Bias()); break; case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: diff --git a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp index 77738fe34c..370934849c 100644 --- a/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp @@ -72,6 +72,14 @@ bool ConvAddReluKernel::Init( build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + DLOG << "init depwise conv basic"; + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -130,6 +138,7 @@ void ConvAddReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW5x5_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true, param.Bias()); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: diff --git a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp index c8cb97c2e2..02fdfb782e 100644 --- a/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp @@ -129,6 +129,14 @@ bool ConvBNReluKernel::Init( build_options); } + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -168,6 +176,7 @@ void ConvBNReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true, nullptr, param.NewScale(), param.NewBias()); break; diff --git a/mobile/src/operators/kernel/cl/conv_kernel.cpp b/mobile/src/operators/kernel/cl/conv_kernel.cpp index 2859715b9c..0965e5feb2 100644 --- a/mobile/src/operators/kernel/cl/conv_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_kernel.cpp @@ -66,6 +66,14 @@ bool ConvKernel::Init(ConvParam *param) { } DLOG << "depth_conv 3x3"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -115,6 +123,7 @@ void ConvKernel::Compute(const ConvParam ¶m) { case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW7x7_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: diff --git a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp index 0e63ccb095..ecfc5fbd10 100644 --- a/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_relu_kernel.cpp @@ -72,6 +72,14 @@ bool ConvReluKernel::Init(FusionConvReluParam *param) { DLOG << "depth_conv 3x3"; + } else if (param->Filter()->dims()[1] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] != 3) { + param->Filter()->InitDWImage(cl_helper_.CLContext(), + cl_helper_.CLCommandQueue()); + + param->ExecMode() = ConvParam::EXEC_DEPTHWISEBASIC_FLOAT; + this->cl_helper_.AddKernel("depth_conv", conv_kernel_file, build_options); } else if (param->Filter()->dims()[2] == 3 && param->Filter()->dims()[3] == 3) { // if (param->Strides()[0] == param->Strides()[1] && @@ -120,6 +128,7 @@ void ConvReluKernel::Compute( case ConvParam::EXEC_SLIDINGWINDOW1x1_FLOAT: case ConvParam::EXEC_SLIDINGWINDOW3x3_FLOAT: case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + case ConvParam::EXEC_DEPTHWISEBASIC_FLOAT: ConvAddBnRelu(&this->cl_helper_, param, true); break; case ConvParam::EXEC_DEPTHWISE3x3S1_FLOAT: diff --git a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp index 9f2aca7850..fd5b9e6bc3 100644 --- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp +++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp @@ -15,6 +15,8 @@ limitations under the License. */ #ifdef ELEMENTWISEMUL_OP #include "operators/kernel/elementwise_mul_kernel.h" +#include +#include #include "framework/cl/cl_image.h" namespace paddle_mobile { @@ -23,19 +25,24 @@ namespace operators { template <> bool ElementwiseMulKernel::Init( ElementwiseMulParam *param) { - DLOG << "-----init add-----"; framework::CLImage *bias = reinterpret_cast( const_cast(param->InputY())); if (bias->dims() == param->InputX()->dims()) { + DLOG << "init element wise mul"; this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 4) { + } else if (bias->dims().size() == 1) { + DLOG << "init channel_mul"; this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); + } else if (bias->dims().size() == 2) { + // etc. input 1 72 28 28 + // filter 1 72 + DLOG << "init channel_mul_d2"; + this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); } else { - DLOG << "error:bias dims is error"; + PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet"); } return true; } - template <> void ElementwiseMulKernel::Compute( const ElementwiseMulParam ¶m) { @@ -64,8 +71,8 @@ void ElementwiseMulKernel::Compute( clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 4) { - DLOG << "zp7 444"; + } else if (bias->dims().size() == 1) { + DLOG << "channel mul"; cl_mem input_image = input->GetCLImage(); cl_mem bias_image = bias->GetCLImage(); cl_mem output_image = output->GetCLImage(); @@ -84,14 +91,48 @@ void ElementwiseMulKernel::Compute( CL_CHECK_ERRORS(status); auto width = input->ImageWidth(); auto height = input->ImageHeight(); - DLOG << "dede:" << width << "," << height; size_t global_work_size[2] = {width, height}; status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); + } else if (bias->dims().size() == 2) { + DLOG << "channel mul d2"; + + // etc. input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); + + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + + // bias->PrintTensor(*bias); } else { - DLOG << "error:bias dims is error"; + PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") } } diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h index 0415291a73..e58159fbb7 100644 --- a/mobile/src/operators/op_param.h +++ b/mobile/src/operators/op_param.h @@ -489,6 +489,7 @@ class ConvParam : public OpParam { EXEC_SLIDINGWINDOW5x5_FLOAT, EXEC_SLIDINGWINDOW7x7_FLOAT, EXEC_GEMM1x1s1_FLOAT, + EXEC_DEPTHWISEBASIC_FLOAT, }; ExecMode &ExecMode() const { return exec_mode_; } -- GitLab