提交 63e6e07f 编写于 作者: L Liangliang He

Optimizing OpenCL kernel with mad/mad24/mul24

上级 22581f22
......@@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input,
DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));
// native_rsqrt seems not faster than rsqrt
DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;
DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value);
const int pos = ch_blk * width + w;
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = in * new_scale + new_offset;
DATA_TYPE4 out = mad(in, new_scale, new_offset);
WRITE_IMAGET(output, (int2)(pos, hb), out);
}
......@@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input,
const int hb = get_global_id(2);
const int width = get_global_size(1);
const int pos = ch_blk * width + w;
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 out = in + bias_value;
......
......@@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0,
const int width_idx = get_global_id(1);
const int width = get_global_size(1);
const int hb_idx = get_global_id(2);
const int input0_chan_blk = (input0_chan + 3) / 4;
const int input0_chan_blk = (input0_chan + 3) >> 2;
DATA_TYPE4 data = 0;
#ifdef DIVISIBLE_FOUR
if (chan_blk_idx + 1 <= input0_chan_blk) {
data = READ_IMAGET(input0,
SAMPLER,
(int2)(chan_blk_idx * width + width_idx, hb_idx));
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
} else {
data = READ_IMAGET(input1,
SAMPLER,
(int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx));
(int2)(mad24((chan_blk_idx - input0_chan_blk), width, width_idx), hb_idx));
}
#else
if (chan_blk_idx + 1 < input0_chan_blk) {
data = READ_IMAGET(input0,
SAMPLER,
(int2)(chan_blk_idx * width + width_idx, hb_idx));
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
} else if (chan_blk_idx >= input0_chan_blk) {
const int in_chan_idx = chan_blk_idx - input0_chan_blk;
DATA_TYPE4 data0 = READ_IMAGET(input1,
SAMPLER,
(int2)(in_chan_idx * width + width_idx, hb_idx));
(int2)(mad24(in_chan_idx, width, width_idx), hb_idx));
DATA_TYPE4 data1 = READ_IMAGET(input1,
SAMPLER,
(int2)((in_chan_idx + 1) * width + width_idx, hb_idx));
(int2)(mad24((in_chan_idx + 1), width, width_idx), hb_idx));
data = stitch_vector(data0, data1, input0_chan % 4, true);
} else {
DATA_TYPE4 data0 = READ_IMAGET(input0,
SAMPLER,
(int2)(chan_blk_idx * width + width_idx, hb_idx));
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
DATA_TYPE4 data1 = READ_IMAGET(input1,
SAMPLER,
(int2)(width_idx, hb_idx));
......@@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0,
}
#endif
WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
}
//__kernel void concat_width(__read_only image2d_t input0,
......
......@@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const int rounded_in_ch = in_ch_blks << 2;
#ifdef BIAS
DATA_TYPE4 out0 =
......@@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
int in_width3 = in_width2 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
const int height_idx = ((out_hb % out_height) << 1) - padding_top;
#endif
const int batch_idx = (out_hb / out_height) * in_height;
const int batch_idx = mul24((out_hb / out_height), in_height);
const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
DATA_TYPE4 in0, in1, in2, in3;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, in_width_idx;
// Unrolling this loop hurt perfmance
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
int filter_x_part1 = 0;
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
......@@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
// int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2);
int filter_idx = filter_x_part0 + filter_x_part1;
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 = mad(in1.z, weights2, out1);
out1 = mad(in1.w, weights3, out1);
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights1, out2);
out2 = mad(in2.z, weights2, out2);
out2 = mad(in2.w, weights3, out2);
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights1, out3);
out3 = mad(in3.z, weights2, out3);
out3 = mad(in3.w, weights3, out3);
filter_x_part1 += rounded_in_ch;
}
filter_x_part0 += rounded_in_ch_x_filter_width;
}
}
......@@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
out3 = fmax(out3, 0);
#endif
const int out_x_base = out_ch_blk * out_width;
const int out_x_base = mul24(out_ch_blk, out_width);
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out0);
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out1);
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out2);
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
w += out_w_blks;
if (w >= out_width) return;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
out3);
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
......@@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.w = w.z + out_w_blks;
int out_hb_idx = (out_hb % height);
#else
w.x = out_w_blk * 2;
w.y = (out_w_blk + out_w_blks) * 2;
w.z = (out_w_blk + 2 * out_w_blks) * 2;
w.w = (out_w_blk + 3 * out_w_blks) * 2;
int out_hb_idx = (out_hb % height) * 2;
w.x = out_w_blk << 1;
w.y = (out_w_blk + out_w_blks) << 1;
w.z = (out_w_blk + (out_w_blks << 1)) << 1;
w.w = (out_w_blk + (out_w_blks << 1) + out_w_blks) << 1;
int out_hb_idx = (out_hb % height) << 1;
#endif
w.x = select(w.x, INT_MIN, w.x >= in_width);
......@@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.z = select(w.z, INT_MIN, w.z >= in_width);
w.w = select(w.w, INT_MIN, w.w >= in_width);
out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx),
-1,
out_hb_idx >= in_height);
// Unrolling this loop hurt perfmance
int in_x_base = 0;
int filter_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));
const int filter_x0 = in_ch_blk << 2;
DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk));
DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk));
DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk));
DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 0, out_ch_blk));
DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 1, out_ch_blk));
DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 2, out_ch_blk));
DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 3, out_ch_blk));
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 = mad(in1.z, weights2, out1);
out1 = mad(in1.w, weights3, out1);
out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights1, out2);
out2 = mad(in2.z, weights2, out2);
out2 = mad(in2.w, weights3, out2);
out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights1, out3);
out3 = mad(in3.z, weights2, out3);
out3 = mad(in3.w, weights3, out3);
in_x_base += in_width;
filter_x_base += 4;
}
#ifdef FUSED_RELU
......@@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3 = fmax(out3, 0);
#endif
const int out_x_base = out_ch_blk * width;
const int out_x_base = mul24(out_ch_blk, width);
int out_x_idx = out_w_blk;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
......@@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out_x_idx += out_w_blks;
if (out_x_idx >= width) return;
WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
}
......@@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const int rounded_in_ch = in_ch_blks << 2;
#ifdef BIAS
DATA_TYPE4 out0 =
......@@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int in_width4 = in_width3 + out_w_blks;
const int height_idx = (out_hb % out_height) - padding_top;
#else
int in_width0 = out_w_blk * 2 - padding_left;
int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
const int height_idx = (out_hb % out_height) * 2 - padding_top;
int in_width0 = (out_w_blk << 1) - padding_left;
int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
const int height_idx = ((out_hb % out_height) << 1) - padding_top;
#endif
const int batch_idx = (out_hb / out_height) * in_height;
const int batch_idx = mul24((out_hb / out_height), in_height);
const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch;
DATA_TYPE4 in0, in1, in2, in3, in4;
DATA_TYPE4 weights0, weights1, weights2, weights3;
int in_idx, hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurt perfmance
int hb_idx, width_idx, in_width_idx;
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_idx = mul24(in_ch_blk, in_width);
int filter_x_part0 = in_ch_blk << 2;
for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
int in_hb_value = height_idx + hb_idx;
in_hb_value = select(in_hb_value + batch_idx,
-1,
(in_hb_value < 0 || in_hb_value >= in_height));
int filter_x_part1 = 0;
for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
......@@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
// int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2);
int filter_idx = filter_x_part0 + filter_x_part1;
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
// Interleaving load and mul does not improve performance as expected
out0 += in0.x * weights0;
out0 += in0.y * weights1;
out0 += in0.z * weights2;
out0 += in0.w * weights3;
out1 += in1.x * weights0;
out1 += in1.y * weights1;
out1 += in1.z * weights2;
out1 += in1.w * weights3;
out2 += in2.x * weights0;
out2 += in2.y * weights1;
out2 += in2.z * weights2;
out2 += in2.w * weights3;
out3 += in3.x * weights0;
out3 += in3.y * weights1;
out3 += in3.z * weights2;
out3 += in3.w * weights3;
out4 += in4.x * weights0;
out4 += in4.y * weights1;
out4 += in4.z * weights2;
out4 += in4.w * weights3;
out0 = mad(in0.x, weights0, out0);
out0 = mad(in0.y, weights1, out0);
out0 = mad(in0.z, weights2, out0);
out0 = mad(in0.w, weights3, out0);
out1 = mad(in1.x, weights0, out1);
out1 = mad(in1.y, weights1, out1);
out1 = mad(in1.z, weights2, out1);
out1 = mad(in1.w, weights3, out1);
out2 = mad(in2.x, weights0, out2);
out2 = mad(in2.y, weights1, out2);
out2 = mad(in2.z, weights2, out2);
out2 = mad(in2.w, weights3, out2);
out3 = mad(in3.x, weights0, out3);
out3 = mad(in3.y, weights1, out3);
out3 = mad(in3.z, weights2, out3);
out3 = mad(in3.w, weights3, out3);
out4 = mad(in4.x, weights0, out4);
out4 = mad(in4.y, weights1, out4);
out4 = mad(in4.z, weights2, out4);
out4 = mad(in4.w, weights3, out4);
filter_x_part1 += rounded_in_ch;
}
filter_x_part0 += rounded_in_ch_x_3;
}
}
......@@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out4 = fmax(out4, 0);
#endif
const int out_x_base = out_ch_blk * out_width;
const int out_x_base = mul24(out_ch_blk, out_width);
int w = out_w_blk;
WRITE_IMAGET(output,
(int2)(out_x_base + w, out_hb),
......
......@@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size,
const int w_start = max(0, pos_w);
const int h_end = min(pos_h + pool_size, h_size);
const int w_end = min(pos_w + pool_size, w_size);
return (h_end - h_start) * (w_end - w_start);
return mul24((h_end - h_start), (w_end - w_start));
}
// Supported data type: half/float
......@@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input,
const int out_width = get_global_size(1);
const int out_hb_idx = get_global_id(2);
const int batch_idx = (out_hb_idx / out_height) * in_height;
const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
const int in_width_start = out_width_idx * stride - pad_left;
const int in_channel_offset = out_chan_idx * in_width;
const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
const int in_width_start = mul24(out_width_idx, stride) - pad_left;
const int in_channel_offset = mul24(out_chan_idx, in_width);
#ifdef POOL_AVG
......@@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input,
}
#endif
WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res);
}
......@@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input,
const int hb = get_global_id(2);
const int width = get_global_size(1);
const int pos = ch_blk * width + w;
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = fmax(in, 0);
WRITE_IMAGET(output, (int2)(pos, hb), out);
......@@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input,
const int hb = get_global_id(2);
const int width = get_global_size(1);
const int pos = ch_blk * width + w;
const int pos = mad24(ch_blk, width, w);
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = clamp(in, 0, max_limit);
WRITE_IMAGET(output, (int2)(pos, hb), out);
......
......@@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
const float h_lerp = h_in - h_lower;
const float w_lerp = w_in - w_lower;
const int in_w_offset = ch_blk * in_width;
const int in_h_offset = b * in_height;
const int in_w_offset = mul24(ch_blk, in_width);
const int in_h_offset = mul24(b, in_height);
DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_lower, in_h_offset + h_lower));
......@@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_upper, in_h_offset + h_upper));
DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
DATA_TYPE4 top = mad((top_right - top_left), w_lerp, top_left);
DATA_TYPE4 bottom = mad((bottom_right - bottom_left), w_lerp, bottom_left);
DATA_TYPE4 out = mad((bottom - top), h_lerp, top);
DATA_TYPE4 out = top + (bottom - top) * h_lerp;
const int out_w_offset = ch_blk * out_width;
const int out_h_offset = b * out_height;
const int out_w_offset = mul24(ch_blk, out_width);
const int out_h_offset = mul24(b, out_height);
WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册