diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h index d8a4c771a5da9aff96cbb86abec63762b9cd4dce..6411b3cd05c925eb9823224bfd198d1033f64cdb 100644 --- a/src/framework/cl/cl_engine.h +++ b/src/framework/cl/cl_engine.h @@ -84,8 +84,24 @@ class CLEngine { bool BuildProgram(cl_program program) { cl_int status; status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math", 0, 0); + CL_CHECK_ERRORS(status); - return true; + + if (status_ == CL_BUILD_PROGRAM_FAILURE) { + size_t log_size; + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char *log = (char *)malloc(log_size); + clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(), + CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + DLOG << " program build error: " << log; + } + + if (status == CL_SUCCESS) { + return true; + } else { + return false; + } } cl_device_id DeviceID(int index = 0) { return devices_[index]; } diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h index 58357632be8b6257de5a9ecf1e1e950f9f422f77..77c55e0acd244d9091cc1d88f5733796a5a1b50f 100644 --- a/src/framework/cl/cl_scope.h +++ b/src/framework/cl/cl_scope.h @@ -59,21 +59,7 @@ class CLScope { context_.get(), "./cl_kernel/" + file_name); DLOG << " --- begin build program -> " << file_name << " --- "; - status_ = - clBuildProgram(program.get(), 0, 0, "-cl-fast-relaxed-math", 0, 0); - - CL_CHECK_ERRORS(status_); - - if (status_ == CL_BUILD_PROGRAM_FAILURE) { - size_t log_size; - clGetProgramBuildInfo(program.get(), CLEngine::Instance()->DeviceID(), - CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char *log = (char *)malloc(log_size); - clGetProgramBuildInfo(program.get(), CLEngine::Instance()->DeviceID(), - CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - DLOG << " program build error: " << log; - } - + CLEngine::Instance()->BuildProgram(program.get()); DLOG << " --- end build program -> " << file_name << " --- "; programs_[file_name] = std::move(program); diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl index ae04c64aa9ce90e39d320f9d8a9b9c3f388bdf13..5ddbd7d327787e66549fbf5dea9fd1280dace531 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -103,47 +103,47 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[0] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[1] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[2] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y - dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15)); input[3] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[4] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[5] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15)); input[6] = select(read_imageh(input_image, sampler, (int2)(pos_in.x - dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[7] = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15)); for (int j = 0; j < 9; ++j) { int2 fuck; @@ -206,6 +206,9 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; @@ -214,7 +217,6 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, const int out_nh_in_one_batch = out_nh % output_height; - const uint kernelHXW = 1; int2 stride_xy = (int2)(stride, stride); int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch); @@ -234,39 +236,48 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15)); inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); + /* + if (output_pos.x == 112 && output_pos.y == 0) { + half4 input1 = inputs[3]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 3 - %v4hlf \n", in); + printf(" --- %d ---\n", in_pos_in_one_block.x - 1); + } + */ + inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15)); inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15)); inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)), (half4)(0.0f), - (ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height)); + (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15)); for (int j = 0; j < 9; ++j) { half4 input = inputs[j]; @@ -288,7 +299,27 @@ __kernel void depth_conv_3x3(__private const int global_size_dim0, output = activation(output); #endif - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + + /* + + if (output_pos.x == 112 && output_pos.y == 0) { + + for (int i = 0; i < 9; ++i) { + half4 input1 = inputs[i]; + float4 in = (float4)(input1.x, input1.y, input1.z, input1.w); + printf(" input4 %d - %v4hlf \n", i, in); + } + + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" depth wise output output4 = %v4hlf \n", out); + printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x); + printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y); + printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x); + printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y); + } + + */ + write_imageh(output_image, output_pos, output); } @@ -331,9 +362,19 @@ __kernel void conv_1x1(__private const int global_size_dim0, half4 output = 0.0f; #endif + int out_c_p = 0, out_w_p = 0, out_nh_p = 0; + +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after bias output4 = %v4hlf \n", out); + + } + +*/ + for (int i = 0; i < input_c; ++i) { - int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); - if (pos_in.x >=0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); half4 input = read_imageh(input_image, sampler, pos_in); half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0)); @@ -347,21 +388,56 @@ __kernel void conv_1x1(__private const int global_size_dim0, half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3)); output.w += dot(input, weight_w); +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + printf("x - %d \n", pos_in.x); - } + printf("y - %d \n", pos_in.y); + + float4 in = (float4)(input.x, input.y, input.z, input.w); + printf("input4 = %v4hlf \n", in); + + float4 w = (float4)(weight_x.x, weight_x.y, weight_x.z, weight_x.w); + printf("weight4 = %v4hlf \n", w); + + } +*/ } +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf("output4 = %v4hlf \n", out); + + } + +*/ #ifdef BATCH_NORM output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); #endif +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after batch output4 = %v4hlf \n", out); + + } + +*/ + #ifdef RELU output = activation(output); #endif - int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); - write_imageh(output_image, output_pos, output); +/* + if (out_c == out_c_p && out_w == out_w_p && out_nh == out_nh_p) { + float4 out = (float4)(output.x, output.y, output.z, output.w); + printf(" after relu output4 = %v4hlf \n", out); -} + } +*/ + int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh); + write_imageh(output_image, output_pos, output); +} diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 0d5dc25157f5d345f299aaca258968b47422979c..061562d61df5150c6048bbb7b6ad9f33359cd331 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -130,11 +130,14 @@ bool ConvAddBNReluKernel::Init( if (param->Filter()->WidthOfOneBlock() == 1 && param->Filter()->HeightOfOneBlock() == 1) { this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv 1x1"; } else if (param->Filter()->dims()[1] == 1) { this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu depth_conv_3x3"; } else if (param->Filter()->WidthOfOneBlock() == 3 && param->Filter()->HeightOfOneBlock() == 3) { this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl"); + DLOG << " conv add bn relu conv_3x3"; } else { PADDLE_MOBILE_THROW_EXCEPTION(" not support "); } @@ -176,6 +179,9 @@ void ConvAddBNReluKernel::Compute( DLOG << " input height " << input_height; DLOG << " output width " << output_width; DLOG << " output height " << output_height; + DLOG << " input dim " << param.Input()->dims(); + DLOG << " output dim " << param.Output()->dims(); + DLOG << " filter dim " << param.Filter()->dims(); cl_int status;