提交 63e6e07f 编写于 作者: L Liangliang He

Optimizing OpenCL kernel with mad/mad24/mul24

上级 22581f22
...@@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input, ...@@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input,
DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));
// native_rsqrt seems not faster than rsqrt
DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon); DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
DATA_TYPE4 new_offset = offset_value - mean_value * new_scale; DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value);
const int pos = ch_blk * width + w; const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = in * new_scale + new_offset; DATA_TYPE4 out = mad(in, new_scale, new_offset);
WRITE_IMAGET(output, (int2)(pos, hb), out); WRITE_IMAGET(output, (int2)(pos, hb), out);
} }
...@@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input, ...@@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input,
const int hb = get_global_id(2); const int hb = get_global_id(2);
const int width = get_global_size(1); const int width = get_global_size(1);
const int pos = mad24(ch_blk, width, w);
const int pos = ch_blk * width + w;
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0)); DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 out = in + bias_value; DATA_TYPE4 out = in + bias_value;
......
...@@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0, ...@@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0,
const int width_idx = get_global_id(1); const int width_idx = get_global_id(1);
const int width = get_global_size(1); const int width = get_global_size(1);
const int hb_idx = get_global_id(2); const int hb_idx = get_global_id(2);
const int input0_chan_blk = (input0_chan + 3) / 4; const int input0_chan_blk = (input0_chan + 3) >> 2;
DATA_TYPE4 data = 0; DATA_TYPE4 data = 0;
#ifdef DIVISIBLE_FOUR #ifdef DIVISIBLE_FOUR
if (chan_blk_idx + 1 <= input0_chan_blk) { if (chan_blk_idx + 1 <= input0_chan_blk) {
data = READ_IMAGET(input0, data = READ_IMAGET(input0,
SAMPLER, SAMPLER,
(int2)(chan_blk_idx * width + width_idx, hb_idx)); (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
} else { } else {
data = READ_IMAGET(input1, data = READ_IMAGET(input1,
SAMPLER, SAMPLER,
(int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx)); (int2)(mad24((chan_blk_idx - input0_chan_blk), width, width_idx), hb_idx));
} }
#else #else
if (chan_blk_idx + 1 < input0_chan_blk) { if (chan_blk_idx + 1 < input0_chan_blk) {
data = READ_IMAGET(input0, data = READ_IMAGET(input0,
SAMPLER, SAMPLER,
(int2)(chan_blk_idx * width + width_idx, hb_idx)); (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
} else if (chan_blk_idx >= input0_chan_blk) { } else if (chan_blk_idx >= input0_chan_blk) {
const int in_chan_idx = chan_blk_idx - input0_chan_blk; const int in_chan_idx = chan_blk_idx - input0_chan_blk;
DATA_TYPE4 data0 = READ_IMAGET(input1, DATA_TYPE4 data0 = READ_IMAGET(input1,
SAMPLER, SAMPLER,
(int2)(in_chan_idx * width + width_idx, hb_idx)); (int2)(mad24(in_chan_idx, width, width_idx), hb_idx));
DATA_TYPE4 data1 = READ_IMAGET(input1, DATA_TYPE4 data1 = READ_IMAGET(input1,
SAMPLER, SAMPLER,
(int2)((in_chan_idx + 1) * width + width_idx, hb_idx)); (int2)(mad24((in_chan_idx + 1), width, width_idx), hb_idx));
data = stitch_vector(data0, data1, input0_chan % 4, true); data = stitch_vector(data0, data1, input0_chan % 4, true);
} else { } else {
DATA_TYPE4 data0 = READ_IMAGET(input0, DATA_TYPE4 data0 = READ_IMAGET(input0,
SAMPLER, SAMPLER,
(int2)(chan_blk_idx * width + width_idx, hb_idx)); (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
DATA_TYPE4 data1 = READ_IMAGET(input1, DATA_TYPE4 data1 = READ_IMAGET(input1,
SAMPLER, SAMPLER,
(int2)(width_idx, hb_idx)); (int2)(width_idx, hb_idx));
...@@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0, ...@@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0,
} }
#endif #endif
WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data); WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
} }
//__kernel void concat_width(__read_only image2d_t input0, //__kernel void concat_width(__read_only image2d_t input0,
......
...@@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1); const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2); const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4; const int rounded_in_ch = in_ch_blks << 2;
#ifdef BIAS #ifdef BIAS
DATA_TYPE4 out0 = DATA_TYPE4 out0 =
...@@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
int in_width3 = in_width2 + out_w_blks; int in_width3 = in_width2 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top; const int height_idx = (out_hb % out_height) - padding_top;
#else #else
int in_width0 = out_w_blk * 2 - padding_left; int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top; const int height_idx = ((out_hb % out_height) << 1) - padding_top;
#endif #endif
const int batch_idx = (out_hb / out_height) * in_height; const int batch_idx = mul24((out_hb / out_height), in_height);
const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
DATA_TYPE4 in0, in1, in2, in3; DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3; DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, in_width_idx;
// Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx; int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
(in_hb_value < 0 || in_hb_value >= in_height)); (in_hb_value < 0 || in_hb_value >= in_height));
int filter_x_part1 = 0;
for (short width_idx = 0; width_idx < filter_width; ++width_idx) { for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value; int in_width_value;
#define READ_INPUT(i) \ #define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \ in_width_value = in_width##i + width_idx; \
...@@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
#undef READ_INPUT #undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch; // int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2);
int filter_idx = filter_x_part0 + filter_x_part1;
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk)); weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk)); weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk)); weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk)); weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data? out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0; out1 = mad(in1.x, weights0, out1);
out1 += in1.y * weights1; out1 = mad(in1.y, weights1, out1);
out1 += in1.z * weights2; out1 = mad(in1.z, weights2, out1);
out1 += in1.w * weights3; out1 = mad(in1.w, weights3, out1);
out2 += in2.x * weights0; out2 = mad(in2.x, weights0, out2);
out2 += in2.y * weights1; out2 = mad(in2.y, weights1, out2);
out2 += in2.z * weights2; out2 = mad(in2.z, weights2, out2);
out2 += in2.w * weights3; out2 = mad(in2.w, weights3, out2);
out3 += in3.x * weights0; out3 = mad(in3.x, weights0, out3);
out3 += in3.y * weights1; out3 = mad(in3.y, weights1, out3);
out3 += in3.z * weights2; out3 = mad(in3.z, weights2, out3);
out3 += in3.w * weights3; out3 = mad(in3.w, weights3, out3);
filter_x_part1 += rounded_in_ch;
} }
filter_x_part0 += rounded_in_ch_x_filter_width;
} }
} }
...@@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ ...@@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
out3 = fmax(out3, 0); out3 = fmax(out3, 0);
#endif #endif
const int out_x_base = out_ch_blk * out_width; const int out_x_base = mul24(out_ch_blk, out_width);
int w = out_w_blk; int w = out_w_blk;
WRITE_IMAGET(output, WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
(int2)(out_x_base + w, out_hb),
out0);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
WRITE_IMAGET(output, WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
(int2)(out_x_base + w, out_hb),
out1);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
WRITE_IMAGET(output, WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
(int2)(out_x_base + w, out_hb),
out2);
w += out_w_blks; w += out_w_blks;
if (w >= out_width) return; if (w >= out_width) return;
WRITE_IMAGET(output, WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
(int2)(out_x_base + w, out_hb),
out3);
} }
...@@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.w = w.z + out_w_blks; w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height); int out_hb_idx = (out_hb % height);
#else #else
w.x = out_w_blk * 2; w.x = out_w_blk << 1;
w.y = (out_w_blk + out_w_blks) * 2; w.y = (out_w_blk + out_w_blks) << 1;
w.z = (out_w_blk + 2 * out_w_blks) * 2; w.z = (out_w_blk + (out_w_blks << 1)) << 1;
w.w = (out_w_blk + 3 * out_w_blks) * 2; w.w = (out_w_blk + (out_w_blks << 1) + out_w_blks) << 1;
int out_hb_idx = (out_hb % height) * 2; int out_hb_idx = (out_hb % height) << 1;
#endif #endif
w.x = select(w.x, INT_MIN, w.x >= in_width); w.x = select(w.x, INT_MIN, w.x >= in_width);
...@@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.z = select(w.z, INT_MIN, w.z >= in_width); w.z = select(w.z, INT_MIN, w.z >= in_width);
w.w = select(w.w, INT_MIN, w.w >= in_width); w.w = select(w.w, INT_MIN, w.w >= in_width);
out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height, out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx),
-1, -1,
out_hb_idx >= in_height); out_hb_idx >= in_height);
// Unrolling this loop hurt perfmance // Unrolling this loop hurt perfmance
int in_x_base = 0; int in_x_base = 0;
int filter_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx)); DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx)); DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx)); DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx)); DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));
const int filter_x0 = in_ch_blk << 2; DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 0, out_ch_blk));
DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk)); DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 1, out_ch_blk));
DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk)); DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 2, out_ch_blk));
DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk)); DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 3, out_ch_blk));
DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data? out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 += in0.x * weights0; out0 = mad(in0.z, weights2, out0);
out0 += in0.y * weights1; out0 = mad(in0.w, weights3, out0);
out0 += in0.z * weights2;
out0 += in0.w * weights3; out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 += in1.x * weights0; out1 = mad(in1.z, weights2, out1);
out1 += in1.y * weights1; out1 = mad(in1.w, weights3, out1);
out1 += in1.z * weights2;
out1 += in1.w * weights3; out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights1, out2);
out2 += in2.x * weights0; out2 = mad(in2.z, weights2, out2);
out2 += in2.y * weights1; out2 = mad(in2.w, weights3, out2);
out2 += in2.z * weights2;
out2 += in2.w * weights3; out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights1, out3);
out3 += in3.x * weights0; out3 = mad(in3.z, weights2, out3);
out3 += in3.y * weights1; out3 = mad(in3.w, weights3, out3);
out3 += in3.z * weights2;
out3 += in3.w * weights3;
in_x_base += in_width; in_x_base += in_width;
filter_x_base += 4;
} }
#ifdef FUSED_RELU #ifdef FUSED_RELU
...@@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3 = fmax(out3, 0); out3 = fmax(out3, 0);
#endif #endif
const int out_x_base = out_ch_blk * width; const int out_x_base = mul24(out_ch_blk, width);
int out_x_idx = out_w_blk; int out_x_idx = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
...@@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out_x_idx += out_w_blks; out_x_idx += out_w_blks;
if (out_x_idx >= width) return; if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
} }
...@@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_w_blk = get_global_id(1); const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1); const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2); const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4; const int rounded_in_ch = in_ch_blks << 2;
#ifdef BIAS #ifdef BIAS
DATA_TYPE4 out0 = DATA_TYPE4 out0 =
...@@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int in_width4 = in_width3 + out_w_blks; int in_width4 = in_width3 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top; const int height_idx = (out_hb % out_height) - padding_top;
#else #else
int in_width0 = out_w_blk * 2 - padding_left; int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left; int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top; const int height_idx = ((out_hb % out_height) << 1) - padding_top;
#endif #endif
const int batch_idx = (out_hb / out_height) * in_height; const int batch_idx = mul24((out_hb / out_height), in_height);
const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch;
DATA_TYPE4 in0, in1, in2, in3, in4; DATA_TYPE4 in0, in1, in2, in3, in4;
DATA_TYPE4 weights0, weights1, weights2, weights3; DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, hb_idx, width_idx, in_width_idx; int hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = height_idx + hb_idx; int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx, in_hb_value = select(in_hb_value + batch_idx,
-1, -1,
(in_hb_value < 0 || in_hb_value >= in_height)); (in_hb_value < 0 || in_hb_value >= in_height));
int filter_x_part1 = 0;
for (short width_idx = 0; width_idx < 3; ++width_idx) { for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value; int in_width_value;
#define READ_INPUT(i) \ #define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \ in_width_value = in_width##i + width_idx; \
...@@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#undef READ_INPUT #undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; // int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2);
int filter_idx = filter_x_part0 + filter_x_part1;
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk)); weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk)); weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk)); weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk)); weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data? out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
// Interleaving load and mul does not improve performance as expected out0 = mad(in0.z, weights2, out0);
out0 += in0.x * weights0; out0 = mad(in0.w, weights3, out0);
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3; out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 += in1.x * weights0; out1 = mad(in1.z, weights2, out1);
out1 += in1.y * weights1; out1 = mad(in1.w, weights3, out1);
out1 += in1.z * weights2;
out1 += in1.w * weights3; out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights1, out2);
out2 += in2.x * weights0; out2 = mad(in2.z, weights2, out2);
out2 += in2.y * weights1; out2 = mad(in2.w, weights3, out2);
out2 += in2.z * weights2;
out2 += in2.w * weights3; out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights1, out3);
out3 += in3.x * weights0; out3 = mad(in3.z, weights2, out3);
out3 += in3.y * weights1; out3 = mad(in3.w, weights3, out3);
out3 += in3.z * weights2;
out3 += in3.w * weights3; out4 = mad(in4.x, weights0, out4);
out4 = mad(in4.y, weights1, out4);
out4 += in4.x * weights0; out4 = mad(in4.z, weights2, out4);
out4 += in4.y * weights1; out4 = mad(in4.w, weights3, out4);
out4 += in4.z * weights2;
out4 += in4.w * weights3; filter_x_part1 += rounded_in_ch;
} }
filter_x_part0 += rounded_in_ch_x_3;
} }
} }
...@@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] ...@@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out4 = fmax(out4, 0); out4 = fmax(out4, 0);
#endif #endif
const int out_x_base = out_ch_blk * out_width; const int out_x_base = mul24(out_ch_blk, out_width);
int w = out_w_blk; int w = out_w_blk;
WRITE_IMAGET(output, WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb), (int2)(out_x_base + w, out_hb),
......
...@@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size, ...@@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size,
const int w_start = max(0, pos_w); const int w_start = max(0, pos_w);
const int h_end = min(pos_h + pool_size, h_size); const int h_end = min(pos_h + pool_size, h_size);
const int w_end = min(pos_w + pool_size, w_size); const int w_end = min(pos_w + pool_size, w_size);
return (h_end - h_start) * (w_end - w_start); return mul24((h_end - h_start), (w_end - w_start));
} }
// Supported data type: half/float // Supported data type: half/float
...@@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input, ...@@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input,
const int out_width = get_global_size(1); const int out_width = get_global_size(1);
const int out_hb_idx = get_global_id(2); const int out_hb_idx = get_global_id(2);
const int batch_idx = (out_hb_idx / out_height) * in_height; const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = (out_hb_idx % out_height) * stride - pad_top; const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
const int in_width_start = out_width_idx * stride - pad_left; const int in_width_start = mul24(out_width_idx, stride) - pad_left;
const int in_channel_offset = out_chan_idx * in_width; const int in_channel_offset = mul24(out_chan_idx, in_width);
#ifdef POOL_AVG #ifdef POOL_AVG
...@@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input, ...@@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input,
} }
#endif #endif
WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res); WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res);
} }
...@@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input, ...@@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input,
const int hb = get_global_id(2); const int hb = get_global_id(2);
const int width = get_global_size(1); const int width = get_global_size(1);
const int pos = ch_blk * width + w; const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = fmax(in, 0); DATA_TYPE4 out = fmax(in, 0);
WRITE_IMAGET(output, (int2)(pos, hb), out); WRITE_IMAGET(output, (int2)(pos, hb), out);
...@@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input, ...@@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input,
const int hb = get_global_id(2); const int hb = get_global_id(2);
const int width = get_global_size(1); const int width = get_global_size(1);
const int pos = ch_blk * width + w; const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb)); DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = clamp(in, 0, max_limit); DATA_TYPE4 out = clamp(in, 0, max_limit);
WRITE_IMAGET(output, (int2)(pos, hb), out); WRITE_IMAGET(output, (int2)(pos, hb), out);
......
...@@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * ...@@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
const float h_lerp = h_in - h_lower; const float h_lerp = h_in - h_lower;
const float w_lerp = w_in - w_lower; const float w_lerp = w_in - w_lower;
const int in_w_offset = ch_blk * in_width; const int in_w_offset = mul24(ch_blk, in_width);
const int in_h_offset = b * in_height; const int in_h_offset = mul24(b, in_height);
DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER, DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_lower, in_h_offset + h_lower)); (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
...@@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * ...@@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER, DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_upper, in_h_offset + h_upper)); (int2)(in_w_offset + w_upper, in_h_offset + h_upper));
DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp; DATA_TYPE4 top = mad((top_right - top_left), w_lerp, top_left);
DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp; DATA_TYPE4 bottom = mad((bottom_right - bottom_left), w_lerp, bottom_left);
DATA_TYPE4 out = mad((bottom - top), h_lerp, top);
DATA_TYPE4 out = top + (bottom - top) * h_lerp; const int out_w_offset = mul24(ch_blk, out_width);
const int out_h_offset = mul24(b, out_height);
const int out_w_offset = ch_blk * out_width;
const int out_h_offset = b * out_height;
WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out); WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册