diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index b7625ca65dec440aa18ad5867626cd0897098980..8fc9f49ffa3deef64ee9fdba956096fffc2dc05c 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -257,16 +257,21 @@ class CLImage { float *p = tensor_data; size_t i0 = 0; for (int n = 0; n < N; n++) { - for (int c = 0; c < C; c++) { + for (int c = 0; c < c_block_ * 4; c++) { size_t i1 = i0 + (c / 4) * W; for (int h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (int w = 0; w < W; w++) { - // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + (c - // % 4); - imageData[i2] = Float2Half(*p); - i2 += 4; - p++; + if (c < C) { + // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + imageData[i2] = Float2Half(*p); + i2 += 4; + p++; + } else { + imageData[i2] = 0.0; + i2 += 4; + } } i1 += width; } diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 0539627b76e506421dc29c3c7d89523d0b10af69..8421c6257320cb2664da91209dc8708255109dfe 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -37,7 +37,7 @@ limitations under the License. */ #include "framework/cl/cl_image.h" #endif -int debug_to = 3; +int debug_to = 2; namespace paddle_mobile { namespace framework { diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index dc6d0e9227e707f6b71474c520afefbd454f9f9a..9299d7cc1979da1eb660e9f66d94545255cad04e 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "feed_op.h" +#include "operators/feed_op.h" + namespace paddle_mobile { namespace operators { @@ -22,6 +23,7 @@ void FeedOp::InferShape() const { out_dims[0] = this->param_.BatchSize(); this->param_.Out()->Resize(out_dims); } + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl index eb73248d740cac8c0553ec93b6aa89a3ab52453b..ae04c64aa9ce90e39d320f9d8a9b9c3f388bdf13 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl @@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl index 69fc177e9e89200dfb5afc006421ca72704e932d..708321faf81546a1dc7306758c3d8d5dae44d737 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl @@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } @@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, - __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl index 02717a0853fc4d3a613c09c58a4933920b7dcd0a..5e27f4955609cc2c3954986fdba7e93778db36d8 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl @@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } @@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, - __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 0ff05d4968f4cfdd7f7b85beb89ed63dee91ec4b..333aa79043639db8586765ed5caa17963ddcae75 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0, const int out_w = get_global_id(1); const int out_nh = get_global_id(2); + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + printf(" out of range "); + return; + } + + int2 stride_xy; stride_xy.x = stride; stride_xy.y = stride; @@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0, input[8] = select(read_imageh(input_image, sampler, (int2)(pos_in.x + dilation, pos_in.y + dilation)), (half4)(0.0f), - (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); + (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height)); for (int j = 0; j < 9; ++j) { int2 fuck; fuck.x = i * 3 + j % 3; - fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3; half4 weight_x = read_imageh(filter, sampler, fuck); output.x += dot(input[j], weight_x); - fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3; half4 weight_y = read_imageh(filter, sampler, fuck); output.y += dot(input[j], weight_y); - fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3; half4 weight_z = read_imageh(filter, sampler, fuck); output.z += dot(input[j], weight_z); - fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3; + fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3; half4 weight_w = read_imageh(filter, sampler, fuck); output.w += dot(input[j], weight_w); } @@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0, write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); } - - - __kernel void depth_conv_3x3(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp index 1c75b8be9fe05eefac0930a6fe2b79c42e952148..0d5dc25157f5d345f299aaca258968b47422979c 100644 --- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp @@ -165,6 +165,18 @@ void ConvAddBNReluKernel::Compute( int output_width = param.Output()->WidthOfOneBlock(); int output_height = param.Output()->HeightOfOneBlock(); + DLOG << " c block " << c_block; + DLOG << " w " << w; + DLOG << " nh " << nh; + DLOG << " stride " << stride; + DLOG << " offset " << offset; + DLOG << " input_c " << input_c; + DLOG << " dilation " << dilation; + DLOG << " input width " << input_width; + DLOG << " input height " << input_height; + DLOG << " output width " << output_width; + DLOG << " output height " << output_height; + cl_int status; status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);