diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h index 98cca486f62d2e2236bc4fcbc3f84f808097110a..f64219028fd9bed6b6d06cc21ad21bb472809427 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add.h @@ -85,6 +85,16 @@ static framework::FusionOpRegistrar convadd_registrar( #ifdef PADDLE_MOBILE_FPGA #endif +#ifdef PADDLE_MOBILE_CL + +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif + +#endif + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl index 708321faf81546a1dc7306758c3d8d5dae44d737..b2492b2ff3b677df14ed425822050d71dab9d72d 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -101,47 +101,47 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[0] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[1] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[2] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[3] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[4] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[5] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[6] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[7] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); for (int j = 0; j < 9; ++j) { int2 fuck; @@ -177,6 +177,7 @@ __kernel void conv_3x3(__private const int global_size_dim0, + __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, @@ -203,6 +204,9 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; @@ -211,7 +215,6 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, const int out_nh_in_one_batch = out_nh % output_height; - const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); @@ -225,53 +228,65 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, #endif int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height); - int weight_x_to = out_c * 3; + int weight_y_to = out_c * 12; half4 inputs[9]; inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); for (int j = 0; j < 9; ++j) { half4 input = inputs[j]; - half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3)); - output.x += input.x * weight.x; - output.y += input.y * weight.y; - output.z += input.z * weight.z; - output.w += input.w * weight.w; + half4 weight0 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + j / 3)); + half4 weight1 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + 3 + j / 3)); + half4 weight2 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + 6 + j / 3)); + half4 weight3 = read_imageh(filter, sampler, (int2)(j % 3, weight_y_to + 9 + j / 3)); + output.x += input.x * weight0.x; + output.y += input.y * weight1.x; + output.z += input.z * weight2.x; + output.w += input.w * weight3.x; } #ifdef BATCH_NORM @@ -282,7 +297,27 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, output = activation(output); #endif - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + write_imageh(output_image, output_pos, output); } @@ -325,9 +360,19 @@ __kernel void conv_1x1(__private const int global_size_dim0, half4 output = 0.0f; #endif + int out_c_p = 0, out_w_p = 0, out_nh_p = 0; + +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after bias output4 = %v4hlf \n", out); + + } + +*/ + for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); half4 input = read_imageh(input_image, sampler, pos_in); half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); @@ -341,21 +386,56 @@ __kernel void conv_1x1(__private const int global_size_dim0, half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); output.w += dot(input, weight_w); +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + printf("x - %d \n", pos_in.x); + + printf("y - %d \n", pos_in.y); + + float4 in = (float4)(input.x, input.y, input.z, input.w); + printf("input4 = %v4hlf \n", in); + + float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w); + printf("weight4 = %v4hlf \n", w); + + } +*/ + } +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf("output4 = %v4hlf \n", out); - } } +*/ + #ifdef BATCH_NORM output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after batch output4 = %v4hlf \n", out); + + } + +*/ + #ifdef RELU output = activation(output); #endif - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos, output); +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after relu output4 = %v4hlf \n", out); -} + } +*/ + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); +} diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp index c6306aeba7fe1a00a6ee16c8d141aadf8d102d9c..ff1dbfd2a382f4b26b6c8c06fc4f23dd884a2f2f 100644 --- a/src/operators/kernel/cl/conv_add_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_kernel.cpp @@ -36,12 +36,12 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { if (param->Filter()->WidthOfOneBlock() == 1 && param->Filter()->HeightOfOneBlock() == 1) { - this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl"); } else if (param->Filter()->dims()[1] == 1) { - this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl"); } else if (param->Filter()->WidthOfOneBlock() == 3 && param->Filter()->HeightOfOneBlock() == 3) { - this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl"); } else { PADDLE_MOBILE_THROW_EXCEPTION(" not support "); }