From e4ec380d3f72026d0fd3ffaba1829a5142005140 Mon Sep 17 00:00:00 2001 From: liuqi Date: Wed, 29 Nov 2017 17:50:35 +0800 Subject: [PATCH] Conv3x3 opencl : remove array. --- mace/kernels/opencl/cl/conv_2d_3x3.cl | 234 +++++++++++++------------- 1 file changed, 118 insertions(+), 116 deletions(-) diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index ef76aa79..30a5cdd7 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -3,7 +3,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ #ifdef BIAS - __read_only image2d_t bias, /* cout%4 * cout/4 */ + __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, __private const int in_height, @@ -19,24 +19,27 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_hb = get_global_id(2); const int rounded_in_ch = in_ch_blks * 4; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; - VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0}; + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIAS - out[0] = - CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0)); - out[1] = out[0]; - out[2] = out[0]; - out[3] = out[0]; - out[4] = out[0]; + out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + out1 = out0; + out2 = out0; + out3 = out0; + out4 = out0; #endif - int w[5]; - w[0] = out_w_blk - padding_left; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - w[4] = w[3] + out_w_blks; + int w0 = out_w_blk - padding_left; + int w1 = w0 + out_w_blks; + int w2 = w1 + out_w_blks; + int w3 = w2 + out_w_blks; + int w4 = w3 + out_w_blks; const int batch_idx = out_hb / out_height; const int height_idx = out_hb % out_height; @@ -51,112 +54,111 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int input_image_width = in_ch_blks * in_width; - VEC_DATA_TYPE(DATA_TYPE, 4) in[5]; - VEC_DATA_TYPE(DATA_TYPE, 4) weights[4]; + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weights0, weights1, weights2, weights3; int in_idx, hb_idx, width_idx, in_width_idx; // Unrolling this loop hurt perfmance - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - for (int i = 0; i < 9; ++i) { - - in_idx = in_ch_blk * in_width; - - hb_idx = i / 3; - width_idx = i % 3; - in_width_idx = w[0] + width_idx; - // Judge the width border for padding input. - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[0] = 0; - } else { - in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[1] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[1] = 0; - } else { - in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[2] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[2] = 0; - } else { - in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + for (short hb_idx = 0; hb_idx < 3; ++ hb_idx) { + for (short width_idx = 0; width_idx < 3; ++width_idx) { + + in_idx = in_ch_blk * in_width; + + in_width_idx = w0 + width_idx; + // Judge the width border for padding input. + if (in_width_idx < 0 || in_width_idx >= in_width) { + in0 = 0; + } else { + in0 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + } + in_width_idx = w1 + width_idx; + if (in_width_idx < 0 || in_width_idx >= in_width) { + in1 = 0; + } else { + in1 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + } + in_width_idx = w2 + width_idx; + if (in_width_idx < 0 || in_width_idx >= in_width) { + in2 = 0; + } else { + in2 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + } + in_width_idx = w3 + width_idx; + if (in_width_idx < 0 || in_width_idx >= in_width) { + in3 = 0; + } else { + in3 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + } + in_width_idx = w4 + width_idx; + if (in_width_idx < 0 || in_width_idx >= in_width) { + in4 = 0; + } else { + in4 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + } + + int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + + // Will prefetch L2 improve performance? How to pretch image data? + + // Interleaving load and mul does not improve performance as expected + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + out4 += in4.x * weights0; + out4 += in4.y * weights1; + out4 += in4.z * weights2; + out4 += in4.w * weights3; } - in_width_idx = w[3] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[3] = 0; - } else { - in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[4] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[4] = 0; - } else { - in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - - - int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch; - weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); - weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); - - // Will prefetch L2 improve performance? How to pretch image data? - - // Interleaving load and mul does not improve performance as expected - out[0] += in[0].x * weights[0]; - out[0] += in[0].y * weights[1]; - out[0] += in[0].z * weights[2]; - out[0] += in[0].w * weights[3]; - - out[1] += in[1].x * weights[0]; - out[1] += in[1].y * weights[1]; - out[1] += in[1].z * weights[2]; - out[1] += in[1].w * weights[3]; - - out[2] += in[2].x * weights[0]; - out[2] += in[2].y * weights[1]; - out[2] += in[2].z * weights[2]; - out[2] += in[2].w * weights[3]; - - out[3] += in[3].x * weights[0]; - out[3] += in[3].y * weights[1]; - out[3] += in[3].z * weights[2]; - out[3] += in[3].w * weights[3]; - - out[4] += in[4].x * weights[0]; - out[4] += in[4].y * weights[1]; - out[4] += in[4].z * weights[2]; - out[4] += in[4].w * weights[3]; } } const int out_x_base = out_ch_blk * out_width; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[0] + padding_left, out_hb), - out[0]); - - w[1] += padding_left; - if (w[1] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[1], out_hb), - out[1]); - - w[2] += padding_left; - if (w[2] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[2], out_hb), - out[2]); - - w[3] += padding_left; - if (w[3] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[3], out_hb), - out[3]); - - w[4] += padding_left; - if (w[4] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[4], out_hb), - out[4]); + WRITE_IMAGET(output, + (int2)(out_x_base + w0 + padding_left, out_hb), + out0); + + w1 += padding_left; + if (w1 >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w1, out_hb), + out1); + + w2 += padding_left; + if (w2 >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w2, out_hb), + out2); + + w3 += padding_left; + if (w3 >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w3, out_hb), + out3); + + w4 += padding_left; + if (w4 >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w4, out_hb), + out4); } -- GitLab