diff --git a/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b1b0de72bbeb9fcffa65a5838802821bbe9267a3 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/depthwise_conv_kernel.cl @@ -0,0 +1,158 @@ +#define BIASE +#define BATCH_NORM +#define RELU +#include "cl_common.h" + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void depth_conv_3x3(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input, + __read_only image2d_t filter, +#ifdef BIASE + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int filter_width, + __private const int filter_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int batch_index = out_nh / output_height; + + const int out_nh_in_one_batch = out_nh % output_height; + + + int2 stride_xy = (int2)(stride, stride); + int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); + + int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset); + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); + + int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height); + + int filter_x = pos_in_filter_block.x ; + int filter_y = pos_in_filter_block.y ; + + half4 inputs[9]; + + inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); + + inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + + + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + + inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); + + half4 filters[9]; + filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y)); + filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y)); + filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y)); + filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1)); + filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1)); + filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1)); + filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2)); + filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2)); + filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2)); + + for(int i = 0 ;i < 9 ; i++){ + output += inputs[i] * filters[i]; + } +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + + write_imageh(output_image, output_pos, output); + +} \ No newline at end of file diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 65f39a0ac6e18e8a206c47ffc1b7e3fba56c6085..c74c5575f78de10025feb1cb3c4e278cdd3b9ebc 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -29,6 +29,14 @@ bool ConvAddBNReluKernel::Init( param->Paddings()[0] == param->Paddings()[1], "need equal"); + auto filter_ddim = param->Filter()->dims(); + + std::vector filter_shape( + {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]}); + framework::DDim ddim = framework::make_ddim(filter_shape); + if (filter_ddim[1] == 1) { + param->Filter()->Resize(ddim); + } param->Filter()->InitCLImage(cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); param->Bias()->InitCLImage(cl_helper_.CLContext(), @@ -43,21 +51,21 @@ bool ConvAddBNReluKernel::Init( const int C = mean->numel(); -// for (int j = 0; j < C; ++j) { -// DLOG << " mean - " << j << mean->data()[j]; -// } -// -// for (int j = 0; j < C; ++j) { -// DLOG << " variance - " << j << variance->data()[j]; -// } -// -// for (int j = 0; j < C; ++j) { -// DLOG << " scale - " << j << scale->data()[j]; -// } -// -// for (int j = 0; j < C; ++j) { -// DLOG << " bias - " << j << bias->data()[j]; -// } + // for (int j = 0; j < C; ++j) { + // DLOG << " mean - " << j << mean->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " variance - " << j << variance->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " scale - " << j << scale->data()[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " bias - " << j << bias->data()[j]; + // } // // DLOG << " climage mean: " << *mean; @@ -85,21 +93,21 @@ bool ConvAddBNReluKernel::Init( framework::CLImage *new_scale = new framework::CLImage(); -// for (int j = 0; j < C; ++j) { -// DLOG << " new scale - " << j << new_scale_ptr[j]; -// } -// -// for (int j = 0; j < C; ++j) { -// DLOG << " new bias - " << j << new_bias_ptr[j]; -// } + // for (int j = 0; j < C; ++j) { + // DLOG << " new scale - " << j << new_scale_ptr[j]; + // } + // + // for (int j = 0; j < C; ++j) { + // DLOG << " new bias - " << j << new_bias_ptr[j]; + // } new_scale->SetTensorData(new_scale_ptr, variance->dims()); new_scale->InitCLImage(this->cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); -// DLOG << " climage - y bias: " << *(param->Bias()); -// -// DLOG << " climage - new scale: " << *new_scale; + // DLOG << " climage - y bias: " << *(param->Bias()); + // + // DLOG << " climage - new scale: " << *new_scale; framework::CLImage *new_bias = new framework::CLImage(); @@ -107,9 +115,9 @@ bool ConvAddBNReluKernel::Init( new_bias->InitCLImage(this->cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); -// DLOG << " climage - new bias: " << *new_bias; -// -// DLOG << " climage - filter: " << *(param->Filter()); + // DLOG << " climage - new bias: " << *new_bias; + // + // DLOG << " climage - filter: " << *(param->Filter()); param->SetNewScale(new_scale); param->SetNewBias(new_bias); @@ -131,8 +139,12 @@ bool ConvAddBNReluKernel::Init( param->Filter()->HeightOfOneBlock() == 1) { this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); DLOG << " conv add bn relu conv 1x1"; - } else if (param->Filter()->dims()[1] == 1) { - this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + } else if (param->Filter()->dims()[0] == 1 && + param->Input()->dims()[1] == param->Output()->dims()[1] && + param->Filter()->dims()[2] == 3) { + // this->cl_helper_.AddKernel("depth_conv_3x3", + // "conv_add_bn_relu_kernel.cl"); + this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl"); DLOG << " conv add bn relu depth_conv_3x3"; } else if (param->Filter()->WidthOfOneBlock() == 3 && param->Filter()->HeightOfOneBlock() == 3) { @@ -167,21 +179,23 @@ void ConvAddBNReluKernel::Compute( int input_height = param.Input()->HeightOfOneBlock(); int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); - -// DLOG << " c block " << c_block; -// DLOG << " w " << w; -// DLOG << " nh " << nh; -// DLOG << " stride " << stride; -// DLOG << " offset " << offset; -// DLOG << " input_c " << input_c; -// DLOG << " dilation " << dilation; -// DLOG << " input width " << input_width; -// DLOG << " input height " << input_height; -// DLOG << " output width " << output_width; -// DLOG << " output height " << output_height; -// DLOG << " input dim " << param.Input()->dims(); -// DLOG << " output dim " << param.Output()->dims(); -// DLOG << " filter dim " << param.Filter()->dims(); + int filter_width = param.Filter()->WidthOfOneBlock(); + int filter_height = param.Filter()->HeightOfOneBlock(); + + // DLOG << " c block " << c_block; + // DLOG << " w " << w; + // DLOG << " nh " << nh; + // DLOG << " stride " << stride; + // DLOG << " offset " << offset; + // DLOG << " input_c " << input_c; + // DLOG << " dilation " << dilation; + // DLOG << " input width " << input_width; + // DLOG << " input height " << input_height; + // DLOG << " output width " << output_width; + // DLOG << " output height " << output_height; + // DLOG << " input dim " << param.Input()->dims(); + // DLOG << " output dim " << param.Output()->dims(); + // DLOG << " filter dim " << param.Filter()->dims(); cl_int status; @@ -236,12 +250,21 @@ void ConvAddBNReluKernel::Compute( status = clSetKernelArg(kernel, 16, sizeof(int), &output_height); CL_CHECK_ERRORS(status); -// cl_event out_event = param.Output()->GetClEvent(); -// cl_event wait_event = param.Input()->GetClEvent(); + if (param.Filter()->dims()[0] == 1 && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == 3) { + status = clSetKernelArg(kernel, 17, sizeof(int), &filter_width); + CL_CHECK_ERRORS(status); + + status = clSetKernelArg(kernel, 18, sizeof(int), &filter_height); + CL_CHECK_ERRORS(status); + } + // cl_event out_event = param.Output()->GetClEvent(); + // cl_event wait_event = param.Input()->GetClEvent(); - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, - default_work_size.data(), NULL, 0, NULL, NULL); + status = clEnqueueNDRangeKernel( + this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL, + default_work_size.data(), NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); }