From 64280b85460c39b2c3bc8b628518c26db3deb678 Mon Sep 17 00:00:00 2001 From: yangfei Date: Fri, 19 Oct 2018 14:44:20 +0800 Subject: [PATCH] imp conv kernel and relu kernel --- .../kernel/cl/cl_kernel/conv_kernel.cl | 142 ++++++++++++++---- src/operators/kernel/cl/relu_kernel.cpp | 41 ++--- 2 files changed, 134 insertions(+), 49 deletions(-) diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index 5e27f49556..5d10a382f8 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -82,47 +82,47 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[0] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[1] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[2] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[3] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[4] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[5] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[6] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[7] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); for (int j = 0; j < 9; ++j) { int2 fuck; @@ -158,6 +158,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, + __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, @@ -184,6 +185,9 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; @@ -192,7 +196,6 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, const int out_nh_in_one_batch = out_nh % output_height; - const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); @@ -206,53 +209,65 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, #endif int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - int weight_x_to = out_c * 3; + int weight_y_to = out_c * 12; half4 inputs[9]; inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); for (int j = 0; j < 9; ++j) { half4 input = inputs[j]; - half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); - output.x += input.x * weight.x; - output.y += input.y * weight.y; - output.z += input.z * weight.z; - output.w += input.w * weight.w; + half4 weight0 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + j / 3)); + half4 weight1 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + 3 + j / 3)); + half4 weight2 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + 6 + j / 3)); + half4 weight3 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + 9 + j / 3)); + output.x += input.x * weight0.x; + output.y += input.y * weight1.x; + output.z += input.z * weight2.x; + output.w += input.w * weight3.x; } #ifdef BATCH_NORM @@ -263,7 +278,27 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, output = activation(output); #endif - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + write_imageh(output_image, output_pos, output); } @@ -306,9 +341,19 @@ __kernel void conv_1x1(__private const int global_size_dim0, half4 output = 0.0f; #endif + int out_c_p = 0, out_w_p = 0, out_nh_p = 0; + +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after bias output4 = %v4hlf \n", out); + + } + +*/ + for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); half4 input = read_imageh(input_image, sampler, pos_in); half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); @@ -322,20 +367,57 @@ __kernel void conv_1x1(__private const int global_size_dim0, half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); output.w += dot(input, weight_w); +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + printf("x - %d \n", pos_in.x); - } + printf("y - %d \n", pos_in.y); + + float4 in = (float4)(input.x, input.y, input.z, input.w); + printf("input4 = %v4hlf \n", in); + + float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w); + printf("weight4 = %v4hlf \n", w); + + } +*/ } +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf("output4 = %v4hlf \n", out); + + } + +*/ #ifdef BATCH_NORM output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after batch output4 = %v4hlf \n", out); + + } + +*/ + #ifdef RELU output = activation(output); #endif +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after relu output4 = %v4hlf \n", out); + + } + +*/ + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); write_imageh(output_image, output_pos, output); - } diff --git a/src/operators/kernel/cl/relu_kernel.cpp b/src/operators/kernel/cl/relu_kernel.cpp index a50079db60..1871e8ae72 100644 --- a/src/operators/kernel/cl/relu_kernel.cpp +++ b/src/operators/kernel/cl/relu_kernel.cpp @@ -21,35 +21,38 @@ namespace operators { template <> bool ReluKernel::Init(ReluParam* param) { this->cl_helper_.AddKernel("relu", "relu.cl"); - this->cl_helper_.AddKernel("relu_p0", "relu.cl"); - this->cl_helper_.AddKernel("relu_p1", "relu.cl"); - const auto dim = - const_cast(param->InputX())->ImageDims(); - param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), - this->cl_helper_.CLCommandQueue(), dim); +// this->cl_helper_.AddKernel("relu_p0", "relu.cl"); +// this->cl_helper_.AddKernel("relu_p1", "relu.cl"); +// const auto dim = +// const_cast(param->InputX())->ImageDims(); +// param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(), +// this->cl_helper_.CLCommandQueue(), dim); return true; } template <> void ReluKernel::Compute(const ReluParam& param) { - auto kernel_p0 = this->cl_helper_.KernelAt(1); - auto kernel_p1 = this->cl_helper_.KernelAt(2); + auto kernel = this->cl_helper_.KernelAt(0); +// auto kernel_p0 = this->cl_helper_.KernelAt(1); +// auto kernel_p1 = this->cl_helper_.KernelAt(2); const auto* input = param.InputX(); auto* output = param.Out(); auto default_work_size = this->cl_helper_.DefaultWorkSize(*output); auto inputImage = input->GetCLImage(); auto outputImage = output->GetCLImage(); - auto tImage = - const_cast&>(param).getMidImage().GetCLImage(); - clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); - clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); - clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); - clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); - const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; - - // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p0, 3, - // NULL, - // work_size, NULL, 0, NULL, NULL); +// auto tImage = +// const_cast&>(param).getMidImage().GetCLImage(); + clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); +// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage); +// clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage); +// clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage); +// clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage); + const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()}; + + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, + work_size, NULL, 0, NULL, NULL); // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3, // NULL, // work_size, NULL, 0, NULL, NULL); -- GitLab