Optimizing OpenCL kernel with mad/mad24/mul24

63e6e07f · Liangliang He · 22581f22 · 63e6e07f · 63e6e07f · 63e6e07f
9 changed files
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input,
  DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));

+  // native_rsqrt seems not faster than rsqrt
  DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
-  DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;
+  DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value);

-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);

  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
-  DATA_TYPE4 out = in * new_scale + new_offset;
+  DATA_TYPE4 out = mad(in, new_scale, new_offset);
  WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input,
  const int hb = get_global_id(2);
  const int width = get_global_size(1);

-
-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
  DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0));
  DATA_TYPE4 out = in + bias_value;

--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0,
  const int width_idx = get_global_id(1);
  const int width = get_global_size(1);
  const int hb_idx = get_global_id(2);
-  const int input0_chan_blk = (input0_chan + 3) / 4;
+  const int input0_chan_blk = (input0_chan + 3) >> 2;

  DATA_TYPE4 data = 0;
 #ifdef DIVISIBLE_FOUR
  if (chan_blk_idx + 1 <= input0_chan_blk) {
    data = READ_IMAGET(input0,
                       SAMPLER,
-                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+                       (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
  } else {
    data = READ_IMAGET(input1,
                       SAMPLER,
-                       (int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx));
+                       (int2)(mad24((chan_blk_idx - input0_chan_blk), width, width_idx), hb_idx));
  }
 #else
  if (chan_blk_idx + 1 < input0_chan_blk) {
    data = READ_IMAGET(input0,
                       SAMPLER,
-                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+                       (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
  } else if (chan_blk_idx >= input0_chan_blk) {
    const int in_chan_idx = chan_blk_idx - input0_chan_blk;
    DATA_TYPE4 data0 = READ_IMAGET(input1,
                                   SAMPLER,
-                                   (int2)(in_chan_idx * width + width_idx, hb_idx));
+                                   (int2)(mad24(in_chan_idx, width, width_idx), hb_idx));
    DATA_TYPE4 data1 = READ_IMAGET(input1,
                                   SAMPLER,
-                                   (int2)((in_chan_idx + 1) * width + width_idx, hb_idx));
+                                   (int2)(mad24((in_chan_idx + 1), width, width_idx), hb_idx));
    data = stitch_vector(data0, data1, input0_chan % 4, true);
  } else {
    DATA_TYPE4 data0 = READ_IMAGET(input0,
                                   SAMPLER,
-                                   (int2)(chan_blk_idx * width + width_idx, hb_idx));
+                                   (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
    DATA_TYPE4 data1 = READ_IMAGET(input1,
                                   SAMPLER,
                                   (int2)(width_idx, hb_idx));
@@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0,
  }
 #endif

-  WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
+  WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
 }

 //__kernel void concat_width(__read_only image2d_t input0,

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
-  const int rounded_in_ch = in_ch_blks * 4;
+  const int rounded_in_ch = in_ch_blks << 2;

 #ifdef BIAS
  DATA_TYPE4 out0 =
@@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  int in_width3 = in_width2 + out_w_blks;
  const int height_idx = (out_hb % out_height) - padding_top;
 #else
-  int in_width0 = out_w_blk * 2 - padding_left;
-  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
-  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
-  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
-  const int height_idx = (out_hb % out_height) * 2 - padding_top;
+  int in_width0 = (out_w_blk << 1) - padding_left;
+  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
+  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
+  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
+  const int height_idx = ((out_hb % out_height) << 1) - padding_top;
 #endif

-  const int batch_idx = (out_hb / out_height) * in_height;
+  const int batch_idx = mul24((out_hb / out_height), in_height);
+  const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);

  DATA_TYPE4 in0, in1, in2, in3;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
-  int in_idx, in_width_idx;
-  // Unrolling this loop hurt perfmance
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    const int in_idx = mul24(in_ch_blk, in_width);
+    int filter_x_part0 = in_ch_blk << 2;
    for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-
      int in_hb_value = height_idx + hb_idx;
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));

+      int filter_x_part1 = 0;
      for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
-        in_idx = in_ch_blk * in_width;
        int in_width_value;
 #define READ_INPUT(i)                                                                \
        in_width_value = in_width##i + width_idx;                                    \
@@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */

 #undef READ_INPUT

-        int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
+        // int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2);
+        int filter_idx = filter_x_part0 + filter_x_part1;
        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));

-        // Will prefetch L2 improve performance? How to pretch image data?
+        out0 = mad(in0.x, weights0, out0);
+        out0 = mad(in0.y, weights1, out0);
+        out0 = mad(in0.z, weights2, out0);
+        out0 = mad(in0.w, weights3, out0);

-        // Interleaving load and mul does not improve performance as expected
-        out0 += in0.x * weights0;
-        out0 += in0.y * weights1;
-        out0 += in0.z * weights2;
-        out0 += in0.w * weights3;

-        out1 += in1.x * weights0;
-        out1 += in1.y * weights1;
-        out1 += in1.z * weights2;
-        out1 += in1.w * weights3;
+        out1 = mad(in1.x, weights0, out1);
+        out1 = mad(in1.y, weights1, out1);
+        out1 = mad(in1.z, weights2, out1);
+        out1 = mad(in1.w, weights3, out1);

-        out2 += in2.x * weights0;
-        out2 += in2.y * weights1;
-        out2 += in2.z * weights2;
-        out2 += in2.w * weights3;
+        out2 = mad(in2.x, weights0, out2);
+        out2 = mad(in2.y, weights1, out2);
+        out2 = mad(in2.z, weights2, out2);
+        out2 = mad(in2.w, weights3, out2);

-        out3 += in3.x * weights0;
-        out3 += in3.y * weights1;
-        out3 += in3.z * weights2;
-        out3 += in3.w * weights3;
+        out3 = mad(in3.x, weights0, out3);
+        out3 = mad(in3.y, weights1, out3);
+        out3 = mad(in3.z, weights2, out3);
+        out3 = mad(in3.w, weights3, out3);

+        filter_x_part1 += rounded_in_ch;
      }
+      filter_x_part0 += rounded_in_ch_x_filter_width;
    }
  }

@@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
  out3 = fmax(out3, 0);
 #endif

-  const int out_x_base = out_ch_blk * out_width;
+  const int out_x_base = mul24(out_ch_blk, out_width);
  int w = out_w_blk;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out0);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);

  w += out_w_blks;
  if (w >= out_width) return;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out1);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);

  w += out_w_blks;
  if (w >= out_width) return;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out2);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);

  w += out_w_blks;
  if (w >= out_width) return;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out3);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);

 }
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  w.w = w.z + out_w_blks;
  int out_hb_idx = (out_hb % height);
 #else
-  w.x = out_w_blk * 2;
-  w.y = (out_w_blk + out_w_blks) * 2;
-  w.z = (out_w_blk + 2 * out_w_blks) * 2;
-  w.w = (out_w_blk + 3 * out_w_blks) * 2;
-  int out_hb_idx = (out_hb % height) * 2;
+  w.x = out_w_blk << 1;
+  w.y = (out_w_blk + out_w_blks) << 1;
+  w.z = (out_w_blk + (out_w_blks << 1)) << 1;
+  w.w = (out_w_blk + (out_w_blks << 1) + out_w_blks) << 1;
+  int out_hb_idx = (out_hb % height) << 1;
 #endif

  w.x = select(w.x, INT_MIN, w.x >= in_width);
@@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  w.z = select(w.z, INT_MIN, w.z >= in_width);
  w.w = select(w.w, INT_MIN, w.w >= in_width);

-  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
+  out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx),
                      -1,
                      out_hb_idx >= in_height);

  // Unrolling this loop hurt perfmance
  int in_x_base = 0;
+  int filter_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-
    DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
    DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
    DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
    DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));

-    const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk));
-    DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk));
-    DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk));
-    DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
-    // Will prefetch L2 improve performance? How to pretch image data?
-
-    out0 += in0.x * weights0;
-    out0 += in0.y * weights1;
-    out0 += in0.z * weights2;
-    out0 += in0.w * weights3;
-
-    out1 += in1.x * weights0;
-    out1 += in1.y * weights1;
-    out1 += in1.z * weights2;
-    out1 += in1.w * weights3;
-
-    out2 += in2.x * weights0;
-    out2 += in2.y * weights1;
-    out2 += in2.z * weights2;
-    out2 += in2.w * weights3;
-
-    out3 += in3.x * weights0;
-    out3 += in3.y * weights1;
-    out3 += in3.z * weights2;
-    out3 += in3.w * weights3;
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 3, out_ch_blk));
+
+    out0 = mad(in0.x, weights0, out0);
+    out0 = mad(in0.y, weights1, out0);
+    out0 = mad(in0.z, weights2, out0);
+    out0 = mad(in0.w, weights3, out0);
+
+    out1 = mad(in1.x, weights0, out1);
+    out1 = mad(in1.y, weights1, out1);
+    out1 = mad(in1.z, weights2, out1);
+    out1 = mad(in1.w, weights3, out1);
+
+    out2 = mad(in2.x, weights0, out2);
+    out2 = mad(in2.y, weights1, out2);
+    out2 = mad(in2.z, weights2, out2);
+    out2 = mad(in2.w, weights3, out2);
+
+    out3 = mad(in3.x, weights0, out3);
+    out3 = mad(in3.y, weights1, out3);
+    out3 = mad(in3.z, weights2, out3);
+    out3 = mad(in3.w, weights3, out3);

    in_x_base += in_width;
+    filter_x_base += 4;
  }

 #ifdef FUSED_RELU
@@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  out3 = fmax(out3, 0);
 #endif

-  const int out_x_base = out_ch_blk * width;
+  const int out_x_base = mul24(out_ch_blk, width);
  int out_x_idx = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

@@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
-
 }
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  const int out_w_blk = get_global_id(1);
  const int out_w_blks = get_global_size(1);
  const int out_hb = get_global_id(2);
-  const int rounded_in_ch = in_ch_blks * 4;
+  const int rounded_in_ch = in_ch_blks << 2;

 #ifdef BIAS
  DATA_TYPE4 out0 =
@@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  int in_width4 = in_width3 + out_w_blks;
  const int height_idx = (out_hb % out_height) - padding_top;
 #else
-  int in_width0 = out_w_blk * 2 - padding_left;
-  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
-  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
-  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
-  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
-  const int height_idx = (out_hb % out_height) * 2 - padding_top;
+  int in_width0 = (out_w_blk << 1) - padding_left;
+  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
+  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
+  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
+  int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
+  const int height_idx = ((out_hb % out_height) << 1) - padding_top;
 #endif

-  const int batch_idx = (out_hb / out_height) * in_height;
+  const int batch_idx = mul24((out_hb / out_height), in_height);
+  const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch;

  DATA_TYPE4 in0, in1, in2, in3, in4;
  DATA_TYPE4 weights0, weights1, weights2, weights3;
-  int in_idx, hb_idx, width_idx, in_width_idx;
-  // Unrolling this loop hurt perfmance
+  int hb_idx, width_idx, in_width_idx;
  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    const int in_idx = mul24(in_ch_blk, in_width);
+    int filter_x_part0 = in_ch_blk << 2;
    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
      int in_hb_value = height_idx + hb_idx;
      in_hb_value = select(in_hb_value + batch_idx,
                           -1,
                           (in_hb_value < 0 || in_hb_value >= in_height));
+      int filter_x_part1 = 0;
      for (short width_idx = 0; width_idx < 3; ++width_idx) {
-
-        in_idx = in_ch_blk * in_width;
        int in_width_value;
 #define READ_INPUT(i)                                                                \
        in_width_value = in_width##i + width_idx;                                    \
@@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]

 #undef READ_INPUT

-        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
+        // int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2);
+        int filter_idx = filter_x_part0 + filter_x_part1;
        weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
        weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
        weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
        weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));

-        // Will prefetch L2 improve performance? How to pretch image data?
-
-        // Interleaving load and mul does not improve performance as expected
-        out0 += in0.x * weights0;
-        out0 += in0.y * weights1;
-        out0 += in0.z * weights2;
-        out0 += in0.w * weights3;
-
-        out1 += in1.x * weights0;
-        out1 += in1.y * weights1;
-        out1 += in1.z * weights2;
-        out1 += in1.w * weights3;
-
-        out2 += in2.x * weights0;
-        out2 += in2.y * weights1;
-        out2 += in2.z * weights2;
-        out2 += in2.w * weights3;
-
-        out3 += in3.x * weights0;
-        out3 += in3.y * weights1;
-        out3 += in3.z * weights2;
-        out3 += in3.w * weights3;
-
-        out4 += in4.x * weights0;
-        out4 += in4.y * weights1;
-        out4 += in4.z * weights2;
-        out4 += in4.w * weights3;
+        out0 = mad(in0.x, weights0, out0);
+        out0 = mad(in0.y, weights1, out0);
+        out0 = mad(in0.z, weights2, out0);
+        out0 = mad(in0.w, weights3, out0);
+
+
+        out1 = mad(in1.x, weights0, out1);
+        out1 = mad(in1.y, weights1, out1);
+        out1 = mad(in1.z, weights2, out1);
+        out1 = mad(in1.w, weights3, out1);
+
+        out2 = mad(in2.x, weights0, out2);
+        out2 = mad(in2.y, weights1, out2);
+        out2 = mad(in2.z, weights2, out2);
+        out2 = mad(in2.w, weights3, out2);
+
+        out3 = mad(in3.x, weights0, out3);
+        out3 = mad(in3.y, weights1, out3);
+        out3 = mad(in3.z, weights2, out3);
+        out3 = mad(in3.w, weights3, out3);
+
+        out4 = mad(in4.x, weights0, out4);
+        out4 = mad(in4.y, weights1, out4);
+        out4 = mad(in4.z, weights2, out4);
+        out4 = mad(in4.w, weights3, out4);
+
+        filter_x_part1 += rounded_in_ch;
      }
+      filter_x_part0 += rounded_in_ch_x_3;
    }
  }

@@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  out4 = fmax(out4, 0);
 #endif

-  const int out_x_base = out_ch_blk * out_width;
+  const int out_x_base = mul24(out_ch_blk, out_width);
  int w = out_w_blk;
  WRITE_IMAGET(output,
               (int2)(out_x_base + w, out_hb),

--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size,
  const int w_start = max(0, pos_w);
  const int h_end = min(pos_h + pool_size, h_size);
  const int w_end = min(pos_w + pool_size, w_size);
-  return (h_end - h_start) * (w_end - w_start);
+  return mul24((h_end - h_start), (w_end - w_start));
 }

 // Supported data type: half/float
@@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input,
  const int out_width = get_global_size(1);
  const int out_hb_idx = get_global_id(2);

-  const int batch_idx = (out_hb_idx / out_height) * in_height;
-  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
-  const int in_width_start = out_width_idx * stride - pad_left;
-  const int in_channel_offset = out_chan_idx * in_width;
+  const int batch_idx = mul24((out_hb_idx / out_height), in_height);
+  const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
+  const int in_width_start = mul24(out_width_idx, stride) - pad_left;
+  const int in_channel_offset = mul24(out_chan_idx, in_width);


 #ifdef POOL_AVG
@@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input,
  }
 #endif

-  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
+  WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res);
 }
--- a/mace/kernels/opencl/cl/relu.cl
+++ b/mace/kernels/opencl/cl/relu.cl
@@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input,
  const int hb = get_global_id(2);
  const int width = get_global_size(1);

-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
  DATA_TYPE4 out = fmax(in, 0);
  WRITE_IMAGET(output, (int2)(pos, hb), out);
@@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input,
  const int hb = get_global_id(2);
  const int width = get_global_size(1);

-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
  DATA_TYPE4 out = clamp(in, 0, max_limit);
  WRITE_IMAGET(output, (int2)(pos, hb), out);

--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
  const float h_lerp = h_in - h_lower;
  const float w_lerp = w_in - w_lower;

-  const int in_w_offset = ch_blk * in_width;
-  const int in_h_offset = b * in_height;
+  const int in_w_offset = mul24(ch_blk, in_width);
+  const int in_h_offset = mul24(b, in_height);

  DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
          (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
@@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
  DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
          (int2)(in_w_offset + w_upper, in_h_offset + h_upper));

-  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
-  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
+  DATA_TYPE4 top = mad((top_right - top_left), w_lerp, top_left);
+  DATA_TYPE4 bottom = mad((bottom_right - bottom_left), w_lerp, bottom_left);
+  DATA_TYPE4 out = mad((bottom - top), h_lerp, top);

-  DATA_TYPE4 out = top + (bottom - top) * h_lerp;
-
-  const int out_w_offset = ch_blk * out_width;
-  const int out_h_offset = b * out_height;
+  const int out_w_offset = mul24(ch_blk, out_width);
+  const int out_h_offset = mul24(b, out_height);
  WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
 }