diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl index 93d249ea25a113c0bb1437b43631e1c9e3372488..027b678bfeac4345155dca29eea17ab0212ec7d6 100644 --- a/mace/kernels/opencl/cl/batch_norm.cl +++ b/mace/kernels/opencl/cl/batch_norm.cl @@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input, DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0)); + // native_rsqrt seems not faster than rsqrt DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon); - DATA_TYPE4 new_offset = offset_value - mean_value * new_scale; + DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value); - const int pos = ch_blk * width + w; + const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); - DATA_TYPE4 out = in * new_scale + new_offset; + DATA_TYPE4 out = mad(in, new_scale, new_offset); WRITE_IMAGET(output, (int2)(pos, hb), out); } diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl index 3dc0eabe60f8fce68e03e8c8abe789fb35ace510..f5180a3c0d58b478d81d08e65743c4af1f77c189 100644 --- a/mace/kernels/opencl/cl/bias_add.cl +++ b/mace/kernels/opencl/cl/bias_add.cl @@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input, const int hb = get_global_id(2); const int width = get_global_size(1); - - const int pos = ch_blk * width + w; + const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 out = in + bias_value; diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index 5ae4fb04e141f6510529083b2fee9e8826494b6d..3a3efea4be125f6d7b94b39871838a7b0f7d96f4 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0, const int width_idx = get_global_id(1); const int width = get_global_size(1); const int hb_idx = get_global_id(2); - const int input0_chan_blk = (input0_chan + 3) / 4; + const int input0_chan_blk = (input0_chan + 3) >> 2; DATA_TYPE4 data = 0; #ifdef DIVISIBLE_FOUR if (chan_blk_idx + 1 <= input0_chan_blk) { data = READ_IMAGET(input0, SAMPLER, - (int2)(chan_blk_idx * width + width_idx, hb_idx)); + (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx)); } else { data = READ_IMAGET(input1, SAMPLER, - (int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx)); + (int2)(mad24((chan_blk_idx - input0_chan_blk), width, width_idx), hb_idx)); } #else if (chan_blk_idx + 1 < input0_chan_blk) { data = READ_IMAGET(input0, SAMPLER, - (int2)(chan_blk_idx * width + width_idx, hb_idx)); + (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx)); } else if (chan_blk_idx >= input0_chan_blk) { const int in_chan_idx = chan_blk_idx - input0_chan_blk; DATA_TYPE4 data0 = READ_IMAGET(input1, SAMPLER, - (int2)(in_chan_idx * width + width_idx, hb_idx)); + (int2)(mad24(in_chan_idx, width, width_idx), hb_idx)); DATA_TYPE4 data1 = READ_IMAGET(input1, SAMPLER, - (int2)((in_chan_idx + 1) * width + width_idx, hb_idx)); + (int2)(mad24((in_chan_idx + 1), width, width_idx), hb_idx)); data = stitch_vector(data0, data1, input0_chan % 4, true); } else { DATA_TYPE4 data0 = READ_IMAGET(input0, SAMPLER, - (int2)(chan_blk_idx * width + width_idx, hb_idx)); + (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx)); DATA_TYPE4 data1 = READ_IMAGET(input1, SAMPLER, (int2)(width_idx, hb_idx)); @@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0, } #endif - WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data); + WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data); } //__kernel void concat_width(__read_only image2d_t input0, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index 060cb39a7921882a6fd2c733e3a370e7c2110ed4..ccfe86d85c83a24898ce9640747fe46b95be2d1f 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ const int out_w_blk = get_global_id(1); const int out_w_blks = get_global_size(1); const int out_hb = get_global_id(2); - const int rounded_in_ch = in_ch_blks * 4; + const int rounded_in_ch = in_ch_blks << 2; #ifdef BIAS DATA_TYPE4 out0 = @@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ int in_width3 = in_width2 + out_w_blks; const int height_idx = (out_hb % out_height) - padding_top; #else - int in_width0 = out_w_blk * 2 - padding_left; - int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; - int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; - int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; - const int height_idx = (out_hb % out_height) * 2 - padding_top; + int in_width0 = (out_w_blk << 1) - padding_left; + int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; + int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; + int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left; + const int height_idx = ((out_hb % out_height) << 1) - padding_top; #endif - const int batch_idx = (out_hb / out_height) * in_height; + const int batch_idx = mul24((out_hb / out_height), in_height); + const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width); DATA_TYPE4 in0, in1, in2, in3; DATA_TYPE4 weights0, weights1, weights2, weights3; - int in_idx, in_width_idx; - // Unrolling this loop hurt perfmance for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + const int in_idx = mul24(in_ch_blk, in_width); + int filter_x_part0 = in_ch_blk << 2; for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { - int in_hb_value = height_idx + hb_idx; in_hb_value = select(in_hb_value + batch_idx, -1, (in_hb_value < 0 || in_hb_value >= in_height)); + int filter_x_part1 = 0; for (short width_idx = 0; width_idx < filter_width; ++width_idx) { - in_idx = in_ch_blk * in_width; int in_width_value; #define READ_INPUT(i) \ in_width_value = in_width##i + width_idx; \ @@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ #undef READ_INPUT - int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch; + // int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2); + int filter_idx = filter_x_part0 + filter_x_part1; weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk)); weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk)); weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk)); weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk)); - // Will prefetch L2 improve performance? How to pretch image data? + out0 = mad(in0.x, weights0, out0); + out0 = mad(in0.y, weights1, out0); + out0 = mad(in0.z, weights2, out0); + out0 = mad(in0.w, weights3, out0); - // Interleaving load and mul does not improve performance as expected - out0 += in0.x * weights0; - out0 += in0.y * weights1; - out0 += in0.z * weights2; - out0 += in0.w * weights3; - out1 += in1.x * weights0; - out1 += in1.y * weights1; - out1 += in1.z * weights2; - out1 += in1.w * weights3; + out1 = mad(in1.x, weights0, out1); + out1 = mad(in1.y, weights1, out1); + out1 = mad(in1.z, weights2, out1); + out1 = mad(in1.w, weights3, out1); - out2 += in2.x * weights0; - out2 += in2.y * weights1; - out2 += in2.z * weights2; - out2 += in2.w * weights3; + out2 = mad(in2.x, weights0, out2); + out2 = mad(in2.y, weights1, out2); + out2 = mad(in2.z, weights2, out2); + out2 = mad(in2.w, weights3, out2); - out3 += in3.x * weights0; - out3 += in3.y * weights1; - out3 += in3.z * weights2; - out3 += in3.w * weights3; + out3 = mad(in3.x, weights0, out3); + out3 = mad(in3.y, weights1, out3); + out3 = mad(in3.z, weights2, out3); + out3 = mad(in3.w, weights3, out3); + filter_x_part1 += rounded_in_ch; } + filter_x_part0 += rounded_in_ch_x_filter_width; } } @@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ out3 = fmax(out3, 0); #endif - const int out_x_base = out_ch_blk * out_width; + const int out_x_base = mul24(out_ch_blk, out_width); int w = out_w_blk; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - out0); + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); w += out_w_blks; if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - out1); + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); w += out_w_blks; if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - out2); + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); w += out_w_blks; if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - out3); + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); } diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 1249f1fcffa00b1719203bd2000ac4335df326c3..e741833a31fde27d0764110629b07dd861dfc3af 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] w.w = w.z + out_w_blks; int out_hb_idx = (out_hb % height); #else - w.x = out_w_blk * 2; - w.y = (out_w_blk + out_w_blks) * 2; - w.z = (out_w_blk + 2 * out_w_blks) * 2; - w.w = (out_w_blk + 3 * out_w_blks) * 2; - int out_hb_idx = (out_hb % height) * 2; + w.x = out_w_blk << 1; + w.y = (out_w_blk + out_w_blks) << 1; + w.z = (out_w_blk + (out_w_blks << 1)) << 1; + w.w = (out_w_blk + (out_w_blks << 1) + out_w_blks) << 1; + int out_hb_idx = (out_hb % height) << 1; #endif w.x = select(w.x, INT_MIN, w.x >= in_width); @@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] w.z = select(w.z, INT_MIN, w.z >= in_width); w.w = select(w.w, INT_MIN, w.w >= in_width); - out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height, + out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx), -1, out_hb_idx >= in_height); // Unrolling this loop hurt perfmance int in_x_base = 0; + int filter_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx)); DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx)); DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx)); DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx)); - const int filter_x0 = in_ch_blk << 2; - DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk)); - DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk)); - DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk)); - DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk)); - // Will prefetch L2 improve performance? How to pretch image data? - - out0 += in0.x * weights0; - out0 += in0.y * weights1; - out0 += in0.z * weights2; - out0 += in0.w * weights3; - - out1 += in1.x * weights0; - out1 += in1.y * weights1; - out1 += in1.z * weights2; - out1 += in1.w * weights3; - - out2 += in2.x * weights0; - out2 += in2.y * weights1; - out2 += in2.z * weights2; - out2 += in2.w * weights3; - - out3 += in3.x * weights0; - out3 += in3.y * weights1; - out3 += in3.z * weights2; - out3 += in3.w * weights3; + DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 0, out_ch_blk)); + DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 1, out_ch_blk)); + DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 2, out_ch_blk)); + DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 3, out_ch_blk)); + + out0 = mad(in0.x, weights0, out0); + out0 = mad(in0.y, weights1, out0); + out0 = mad(in0.z, weights2, out0); + out0 = mad(in0.w, weights3, out0); + + out1 = mad(in1.x, weights0, out1); + out1 = mad(in1.y, weights1, out1); + out1 = mad(in1.z, weights2, out1); + out1 = mad(in1.w, weights3, out1); + + out2 = mad(in2.x, weights0, out2); + out2 = mad(in2.y, weights1, out2); + out2 = mad(in2.z, weights2, out2); + out2 = mad(in2.w, weights3, out2); + + out3 = mad(in3.x, weights0, out3); + out3 = mad(in3.y, weights1, out3); + out3 = mad(in3.z, weights2, out3); + out3 = mad(in3.w, weights3, out3); in_x_base += in_width; + filter_x_base += 4; } #ifdef FUSED_RELU @@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out3 = fmax(out3, 0); #endif - const int out_x_base = out_ch_blk * width; + const int out_x_base = mul24(out_ch_blk, width); int out_x_idx = out_w_blk; WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); @@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out_x_idx += out_w_blks; if (out_x_idx >= width) return; WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); - } diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 9d032a729fbf767442e27c28f878d3ed1f89c474..8a5ece6b31d907fc0a564c7407c969d6102b4c3a 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_w_blk = get_global_id(1); const int out_w_blks = get_global_size(1); const int out_hb = get_global_id(2); - const int rounded_in_ch = in_ch_blks * 4; + const int rounded_in_ch = in_ch_blks << 2; #ifdef BIAS DATA_TYPE4 out0 = @@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] int in_width4 = in_width3 + out_w_blks; const int height_idx = (out_hb % out_height) - padding_top; #else - int in_width0 = out_w_blk * 2 - padding_left; - int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; - int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; - int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; - int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left; - const int height_idx = (out_hb % out_height) * 2 - padding_top; + int in_width0 = (out_w_blk << 1) - padding_left; + int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; + int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; + int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left; + int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left; + const int height_idx = ((out_hb % out_height) << 1) - padding_top; #endif - const int batch_idx = (out_hb / out_height) * in_height; + const int batch_idx = mul24((out_hb / out_height), in_height); + const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch; DATA_TYPE4 in0, in1, in2, in3, in4; DATA_TYPE4 weights0, weights1, weights2, weights3; - int in_idx, hb_idx, width_idx, in_width_idx; - // Unrolling this loop hurt perfmance + int hb_idx, width_idx, in_width_idx; for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + const int in_idx = mul24(in_ch_blk, in_width); + int filter_x_part0 = in_ch_blk << 2; for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { int in_hb_value = height_idx + hb_idx; in_hb_value = select(in_hb_value + batch_idx, -1, (in_hb_value < 0 || in_hb_value >= in_height)); + int filter_x_part1 = 0; for (short width_idx = 0; width_idx < 3; ++width_idx) { - - in_idx = in_ch_blk * in_width; int in_width_value; #define READ_INPUT(i) \ in_width_value = in_width##i + width_idx; \ @@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #undef READ_INPUT - int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; + // int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2); + int filter_idx = filter_x_part0 + filter_x_part1; weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk)); weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk)); weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk)); weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk)); - // Will prefetch L2 improve performance? How to pretch image data? - - // Interleaving load and mul does not improve performance as expected - out0 += in0.x * weights0; - out0 += in0.y * weights1; - out0 += in0.z * weights2; - out0 += in0.w * weights3; - - out1 += in1.x * weights0; - out1 += in1.y * weights1; - out1 += in1.z * weights2; - out1 += in1.w * weights3; - - out2 += in2.x * weights0; - out2 += in2.y * weights1; - out2 += in2.z * weights2; - out2 += in2.w * weights3; - - out3 += in3.x * weights0; - out3 += in3.y * weights1; - out3 += in3.z * weights2; - out3 += in3.w * weights3; - - out4 += in4.x * weights0; - out4 += in4.y * weights1; - out4 += in4.z * weights2; - out4 += in4.w * weights3; + out0 = mad(in0.x, weights0, out0); + out0 = mad(in0.y, weights1, out0); + out0 = mad(in0.z, weights2, out0); + out0 = mad(in0.w, weights3, out0); + + + out1 = mad(in1.x, weights0, out1); + out1 = mad(in1.y, weights1, out1); + out1 = mad(in1.z, weights2, out1); + out1 = mad(in1.w, weights3, out1); + + out2 = mad(in2.x, weights0, out2); + out2 = mad(in2.y, weights1, out2); + out2 = mad(in2.z, weights2, out2); + out2 = mad(in2.w, weights3, out2); + + out3 = mad(in3.x, weights0, out3); + out3 = mad(in3.y, weights1, out3); + out3 = mad(in3.z, weights2, out3); + out3 = mad(in3.w, weights3, out3); + + out4 = mad(in4.x, weights0, out4); + out4 = mad(in4.y, weights1, out4); + out4 = mad(in4.z, weights2, out4); + out4 = mad(in4.w, weights3, out4); + + filter_x_part1 += rounded_in_ch; } + filter_x_part0 += rounded_in_ch_x_3; } } @@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out4 = fmax(out4, 0); #endif - const int out_x_base = out_ch_blk * out_width; + const int out_x_base = mul24(out_ch_blk, out_width); int w = out_w_blk; WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl index bd2763fc8c7eb68c06e09b4822ccc5025807a151..f2298a93264e5a3ad79ac3977d226ef7dbb3058a 100644 --- a/mace/kernels/opencl/cl/pooling.cl +++ b/mace/kernels/opencl/cl/pooling.cl @@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size, const int w_start = max(0, pos_w); const int h_end = min(pos_h + pool_size, h_size); const int w_end = min(pos_w + pool_size, w_size); - return (h_end - h_start) * (w_end - w_start); + return mul24((h_end - h_start), (w_end - w_start)); } // Supported data type: half/float @@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input, const int out_width = get_global_size(1); const int out_hb_idx = get_global_id(2); - const int batch_idx = (out_hb_idx / out_height) * in_height; - const int in_height_start = (out_hb_idx % out_height) * stride - pad_top; - const int in_width_start = out_width_idx * stride - pad_left; - const int in_channel_offset = out_chan_idx * in_width; + const int batch_idx = mul24((out_hb_idx / out_height), in_height); + const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top; + const int in_width_start = mul24(out_width_idx, stride) - pad_left; + const int in_channel_offset = mul24(out_chan_idx, in_width); #ifdef POOL_AVG @@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input, } #endif - WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res); + WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res); } diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl index c8ea1c092f898b912db90555fd3309d7d21d349a..e0762bdb0312af28ae49a2b54927e36bfc5b78dc 100644 --- a/mace/kernels/opencl/cl/relu.cl +++ b/mace/kernels/opencl/cl/relu.cl @@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input, const int hb = get_global_id(2); const int width = get_global_size(1); - const int pos = ch_blk * width + w; + const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 out = fmax(in, 0); WRITE_IMAGET(output, (int2)(pos, hb), out); @@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input, const int hb = get_global_id(2); const int width = get_global_size(1); - const int pos = ch_blk * width + w; + const int pos = mad24(ch_blk, width, w); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 out = clamp(in, 0, max_limit); WRITE_IMAGET(output, (int2)(pos, hb), out); diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index 72f09c96cd266725237ea7afb8d88fe1915b2573..e0b4b83dcf2e6cc4610d664408db05550a58f0de 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * const float h_lerp = h_in - h_lower; const float w_lerp = w_in - w_lower; - const int in_w_offset = ch_blk * in_width; - const int in_h_offset = b * in_height; + const int in_w_offset = mul24(ch_blk, in_width); + const int in_h_offset = mul24(b, in_height); DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_lower, in_h_offset + h_lower)); @@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_upper, in_h_offset + h_upper)); - DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp; - DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; + DATA_TYPE4 top = mad((top_right - top_left), w_lerp, top_left); + DATA_TYPE4 bottom = mad((bottom_right - bottom_left), w_lerp, bottom_left); + DATA_TYPE4 out = mad((bottom - top), h_lerp, top); - DATA_TYPE4 out = top + (bottom - top) * h_lerp; - - const int out_w_offset = ch_blk * out_width; - const int out_h_offset = b * out_height; + const int out_w_offset = mul24(ch_blk, out_width); + const int out_h_offset = mul24(b, out_height); WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out); }