diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 93d249ea25a113c0bb1437b43631e1c9e3372488..027b678bfeac4345155dca29eea17ab0212ec7d6 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input,
   DATA_TYPE4 mean_value = READ_IMAGET(mean, SAMPLER, (int2)(ch_blk, 0));
   DATA_TYPE4 var_value = READ_IMAGET(var, SAMPLER, (int2)(ch_blk, 0));
 
+  // native_rsqrt seems not faster than rsqrt
   DATA_TYPE4 new_scale = scale_value * rsqrt(var_value + (DATA_TYPE4)epsilon);
-  DATA_TYPE4 new_offset = offset_value - mean_value * new_scale;
+  DATA_TYPE4 new_offset = mad(0 - mean_value, new_scale, offset_value);
 
-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
 
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
-  DATA_TYPE4 out = in * new_scale + new_offset;
+  DATA_TYPE4 out = mad(in, new_scale, new_offset);
   WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
diff --git a/mace/kernels/opencl/cl/bias_add.cl b/mace/kernels/opencl/cl/bias_add.cl
index 3dc0eabe60f8fce68e03e8c8abe789fb35ace510..f5180a3c0d58b478d81d08e65743c4af1f77c189 100644
--- a/mace/kernels/opencl/cl/bias_add.cl
+++ b/mace/kernels/opencl/cl/bias_add.cl
@@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input,
   const int hb = get_global_id(2);
   const int width = get_global_size(1);
 
-
-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
   DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(ch_blk, 0));
   DATA_TYPE4 out = in + bias_value;
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
index 5ae4fb04e141f6510529083b2fee9e8826494b6d..3a3efea4be125f6d7b94b39871838a7b0f7d96f4 100644
--- a/mace/kernels/opencl/cl/concat.cl
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0,
   const int width_idx = get_global_id(1);
   const int width = get_global_size(1);
   const int hb_idx = get_global_id(2);
-  const int input0_chan_blk = (input0_chan + 3) / 4;
+  const int input0_chan_blk = (input0_chan + 3) >> 2;
 
   DATA_TYPE4 data = 0;
 #ifdef DIVISIBLE_FOUR
   if (chan_blk_idx + 1 <= input0_chan_blk) {
     data = READ_IMAGET(input0,
                        SAMPLER,
-                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+                       (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
   } else {
     data = READ_IMAGET(input1,
                        SAMPLER,
-                       (int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx));
+                       (int2)(mad24((chan_blk_idx - input0_chan_blk), width, width_idx), hb_idx));
   }
 #else
   if (chan_blk_idx + 1 < input0_chan_blk) {
     data = READ_IMAGET(input0,
                        SAMPLER,
-                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+                       (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
   } else if (chan_blk_idx >= input0_chan_blk) {
     const int in_chan_idx = chan_blk_idx - input0_chan_blk;
     DATA_TYPE4 data0 = READ_IMAGET(input1,
                                    SAMPLER,
-                                   (int2)(in_chan_idx * width + width_idx, hb_idx));
+                                   (int2)(mad24(in_chan_idx, width, width_idx), hb_idx));
     DATA_TYPE4 data1 = READ_IMAGET(input1,
                                    SAMPLER,
-                                   (int2)((in_chan_idx + 1) * width + width_idx, hb_idx));
+                                   (int2)(mad24((in_chan_idx + 1), width, width_idx), hb_idx));
     data = stitch_vector(data0, data1, input0_chan % 4, true);
   } else {
     DATA_TYPE4 data0 = READ_IMAGET(input0,
                                    SAMPLER,
-                                   (int2)(chan_blk_idx * width + width_idx, hb_idx));
+                                   (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
     DATA_TYPE4 data1 = READ_IMAGET(input1,
                                    SAMPLER,
                                    (int2)(width_idx, hb_idx));
@@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0,
   }
 #endif
 
-  WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
+  WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
 }
 
 //__kernel void concat_width(__read_only image2d_t input0,
diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl
index 060cb39a7921882a6fd2c733e3a370e7c2110ed4..ccfe86d85c83a24898ce9640747fe46b95be2d1f 100644
--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
-  const int rounded_in_ch = in_ch_blks * 4;
+  const int rounded_in_ch = in_ch_blks << 2;
 
 #ifdef BIAS
   DATA_TYPE4 out0 =
@@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   int in_width3 = in_width2 + out_w_blks;
   const int height_idx = (out_hb % out_height) - padding_top;
 #else
-  int in_width0 = out_w_blk * 2 - padding_left;
-  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
-  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
-  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
-  const int height_idx = (out_hb % out_height) * 2 - padding_top;
+  int in_width0 = (out_w_blk << 1) - padding_left;
+  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
+  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
+  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
+  const int height_idx = ((out_hb % out_height) << 1) - padding_top;
 #endif
 
-  const int batch_idx = (out_hb / out_height) * in_height;
+  const int batch_idx = mul24((out_hb / out_height), in_height);
+  const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);
 
   DATA_TYPE4 in0, in1, in2, in3;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
-  int in_idx, in_width_idx;
-  // Unrolling this loop hurt perfmance
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    const int in_idx = mul24(in_ch_blk, in_width);
+    int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
-
       int in_hb_value = height_idx + hb_idx;
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
                            (in_hb_value < 0 || in_hb_value >= in_height));
 
+      int filter_x_part1 = 0;
       for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
-        in_idx = in_ch_blk * in_width;
         int in_width_value;
 #define READ_INPUT(i)                                                                \
         in_width_value = in_width##i + width_idx;                                    \
@@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 
 #undef READ_INPUT
 
-        int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
+        // int filter_idx = (hb_idx * filter_width + width_idx) * rounded_in_ch + (in_ch_blk << 2);
+        int filter_idx = filter_x_part0 + filter_x_part1;
         weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
         weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
         weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
         weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
 
-        // Will prefetch L2 improve performance? How to pretch image data?
+        out0 = mad(in0.x, weights0, out0);
+        out0 = mad(in0.y, weights1, out0);
+        out0 = mad(in0.z, weights2, out0);
+        out0 = mad(in0.w, weights3, out0);
 
-        // Interleaving load and mul does not improve performance as expected
-        out0 += in0.x * weights0;
-        out0 += in0.y * weights1;
-        out0 += in0.z * weights2;
-        out0 += in0.w * weights3;
 
-        out1 += in1.x * weights0;
-        out1 += in1.y * weights1;
-        out1 += in1.z * weights2;
-        out1 += in1.w * weights3;
+        out1 = mad(in1.x, weights0, out1);
+        out1 = mad(in1.y, weights1, out1);
+        out1 = mad(in1.z, weights2, out1);
+        out1 = mad(in1.w, weights3, out1);
 
-        out2 += in2.x * weights0;
-        out2 += in2.y * weights1;
-        out2 += in2.z * weights2;
-        out2 += in2.w * weights3;
+        out2 = mad(in2.x, weights0, out2);
+        out2 = mad(in2.y, weights1, out2);
+        out2 = mad(in2.z, weights2, out2);
+        out2 = mad(in2.w, weights3, out2);
 
-        out3 += in3.x * weights0;
-        out3 += in3.y * weights1;
-        out3 += in3.z * weights2;
-        out3 += in3.w * weights3;
+        out3 = mad(in3.x, weights0, out3);
+        out3 = mad(in3.y, weights1, out3);
+        out3 = mad(in3.z, weights2, out3);
+        out3 = mad(in3.w, weights3, out3);
 
+        filter_x_part1 += rounded_in_ch;
       }
+      filter_x_part0 += rounded_in_ch_x_filter_width;
     }
   }
 
@@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   out3 = fmax(out3, 0);
 #endif
 
-  const int out_x_base = out_ch_blk * out_width;
+  const int out_x_base = mul24(out_ch_blk, out_width);
   int w = out_w_blk;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out0);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
 
   w += out_w_blks;
   if (w >= out_width) return;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out1);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
 
   w += out_w_blks;
   if (w >= out_width) return;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out2);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
 
   w += out_w_blks;
   if (w >= out_width) return;
-  WRITE_IMAGET(output,
-               (int2)(out_x_base + w, out_hb),
-               out3);
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
 
 }
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 1249f1fcffa00b1719203bd2000ac4335df326c3..e741833a31fde27d0764110629b07dd861dfc3af 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   w.w = w.z + out_w_blks;
   int out_hb_idx = (out_hb % height);
 #else
-  w.x = out_w_blk * 2;
-  w.y = (out_w_blk + out_w_blks) * 2;
-  w.z = (out_w_blk + 2 * out_w_blks) * 2;
-  w.w = (out_w_blk + 3 * out_w_blks) * 2;
-  int out_hb_idx = (out_hb % height) * 2;
+  w.x = out_w_blk << 1;
+  w.y = (out_w_blk + out_w_blks) << 1;
+  w.z = (out_w_blk + (out_w_blks << 1)) << 1;
+  w.w = (out_w_blk + (out_w_blks << 1) + out_w_blks) << 1;
+  int out_hb_idx = (out_hb % height) << 1;
 #endif
 
   w.x = select(w.x, INT_MIN, w.x >= in_width);
@@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   w.z = select(w.z, INT_MIN, w.z >= in_width);
   w.w = select(w.w, INT_MIN, w.w >= in_width);
 
-  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
+  out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx),
                       -1,
                       out_hb_idx >= in_height);
 
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
+  int filter_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-
     DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
     DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
     DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
     DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));
 
-    const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk));
-    DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk));
-    DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk));
-    DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
-    // Will prefetch L2 improve performance? How to pretch image data?
-
-    out0 += in0.x * weights0;
-    out0 += in0.y * weights1;
-    out0 += in0.z * weights2;
-    out0 += in0.w * weights3;
-
-    out1 += in1.x * weights0;
-    out1 += in1.y * weights1;
-    out1 += in1.z * weights2;
-    out1 += in1.w * weights3;
-
-    out2 += in2.x * weights0;
-    out2 += in2.y * weights1;
-    out2 += in2.z * weights2;
-    out2 += in2.w * weights3;
-
-    out3 += in3.x * weights0;
-    out3 += in3.y * weights1;
-    out3 += in3.z * weights2;
-    out3 += in3.w * weights3;
+    DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 0, out_ch_blk));
+    DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 1, out_ch_blk));
+    DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 2, out_ch_blk));
+    DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x_base + 3, out_ch_blk));
+
+    out0 = mad(in0.x, weights0, out0);
+    out0 = mad(in0.y, weights1, out0);
+    out0 = mad(in0.z, weights2, out0);
+    out0 = mad(in0.w, weights3, out0);
+
+    out1 = mad(in1.x, weights0, out1);
+    out1 = mad(in1.y, weights1, out1);
+    out1 = mad(in1.z, weights2, out1);
+    out1 = mad(in1.w, weights3, out1);
+
+    out2 = mad(in2.x, weights0, out2);
+    out2 = mad(in2.y, weights1, out2);
+    out2 = mad(in2.z, weights2, out2);
+    out2 = mad(in2.w, weights3, out2);
+
+    out3 = mad(in3.x, weights0, out3);
+    out3 = mad(in3.y, weights1, out3);
+    out3 = mad(in3.z, weights2, out3);
+    out3 = mad(in3.w, weights3, out3);
 
     in_x_base += in_width;
+    filter_x_base += 4;
   }
 
 #ifdef FUSED_RELU
@@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   out3 = fmax(out3, 0);
 #endif
 
-  const int out_x_base = out_ch_blk * width;
+  const int out_x_base = mul24(out_ch_blk, width);
   int out_x_idx = out_w_blk;
   WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
 
@@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   out_x_idx += out_w_blks;
   if (out_x_idx >= width) return;
   WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
-
 }
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 9d032a729fbf767442e27c28f878d3ed1f89c474..8a5ece6b31d907fc0a564c7407c969d6102b4c3a 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
-  const int rounded_in_ch = in_ch_blks * 4;
+  const int rounded_in_ch = in_ch_blks << 2;
 
 #ifdef BIAS
   DATA_TYPE4 out0 =
@@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   int in_width4 = in_width3 + out_w_blks;
   const int height_idx = (out_hb % out_height) - padding_top;
 #else
-  int in_width0 = out_w_blk * 2 - padding_left;
-  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
-  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
-  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
-  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
-  const int height_idx = (out_hb % out_height) * 2 - padding_top;
+  int in_width0 = (out_w_blk << 1) - padding_left;
+  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
+  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
+  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
+  int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
+  const int height_idx = ((out_hb % out_height) << 1) - padding_top;
 #endif
 
-  const int batch_idx = (out_hb / out_height) * in_height;
+  const int batch_idx = mul24((out_hb / out_height), in_height);
+  const int rounded_in_ch_x_3 = (rounded_in_ch << 1) + rounded_in_ch;
 
   DATA_TYPE4 in0, in1, in2, in3, in4;
   DATA_TYPE4 weights0, weights1, weights2, weights3;
-  int in_idx, hb_idx, width_idx, in_width_idx;
-  // Unrolling this loop hurt perfmance
+  int hb_idx, width_idx, in_width_idx;
   for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    const int in_idx = mul24(in_ch_blk, in_width);
+    int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
       int in_hb_value = height_idx + hb_idx;
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
                            (in_hb_value < 0 || in_hb_value >= in_height));
+      int filter_x_part1 = 0;
       for (short width_idx = 0; width_idx < 3; ++width_idx) {
-
-        in_idx = in_ch_blk * in_width;
         int in_width_value;
 #define READ_INPUT(i)                                                                \
         in_width_value = in_width##i + width_idx;                                    \
@@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 
 #undef READ_INPUT
 
-        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
+        // int filter_idx = (hb_idx * 3 + width_idx) * rounded_in_ch + (in_ch_blk << 2);
+        int filter_idx = filter_x_part0 + filter_x_part1;
         weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
         weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
         weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
         weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
 
-        // Will prefetch L2 improve performance? How to pretch image data?
-
-        // Interleaving load and mul does not improve performance as expected
-        out0 += in0.x * weights0;
-        out0 += in0.y * weights1;
-        out0 += in0.z * weights2;
-        out0 += in0.w * weights3;
-
-        out1 += in1.x * weights0;
-        out1 += in1.y * weights1;
-        out1 += in1.z * weights2;
-        out1 += in1.w * weights3;
-
-        out2 += in2.x * weights0;
-        out2 += in2.y * weights1;
-        out2 += in2.z * weights2;
-        out2 += in2.w * weights3;
-
-        out3 += in3.x * weights0;
-        out3 += in3.y * weights1;
-        out3 += in3.z * weights2;
-        out3 += in3.w * weights3;
-
-        out4 += in4.x * weights0;
-        out4 += in4.y * weights1;
-        out4 += in4.z * weights2;
-        out4 += in4.w * weights3;
+        out0 = mad(in0.x, weights0, out0);
+        out0 = mad(in0.y, weights1, out0);
+        out0 = mad(in0.z, weights2, out0);
+        out0 = mad(in0.w, weights3, out0);
+
+
+        out1 = mad(in1.x, weights0, out1);
+        out1 = mad(in1.y, weights1, out1);
+        out1 = mad(in1.z, weights2, out1);
+        out1 = mad(in1.w, weights3, out1);
+
+        out2 = mad(in2.x, weights0, out2);
+        out2 = mad(in2.y, weights1, out2);
+        out2 = mad(in2.z, weights2, out2);
+        out2 = mad(in2.w, weights3, out2);
+
+        out3 = mad(in3.x, weights0, out3);
+        out3 = mad(in3.y, weights1, out3);
+        out3 = mad(in3.z, weights2, out3);
+        out3 = mad(in3.w, weights3, out3);
+
+        out4 = mad(in4.x, weights0, out4);
+        out4 = mad(in4.y, weights1, out4);
+        out4 = mad(in4.z, weights2, out4);
+        out4 = mad(in4.w, weights3, out4);
+
+        filter_x_part1 += rounded_in_ch;
       }
+      filter_x_part0 += rounded_in_ch_x_3;
     }
   }
 
@@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   out4 = fmax(out4, 0);
 #endif
 
-  const int out_x_base = out_ch_blk * out_width;
+  const int out_x_base = mul24(out_ch_blk, out_width);
   int w = out_w_blk;
   WRITE_IMAGET(output,
                (int2)(out_x_base + w, out_hb),
diff --git a/mace/kernels/opencl/cl/pooling.cl b/mace/kernels/opencl/cl/pooling.cl
index bd2763fc8c7eb68c06e09b4822ccc5025807a151..f2298a93264e5a3ad79ac3977d226ef7dbb3058a 100644
--- a/mace/kernels/opencl/cl/pooling.cl
+++ b/mace/kernels/opencl/cl/pooling.cl
@@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size,
   const int w_start = max(0, pos_w);
   const int h_end = min(pos_h + pool_size, h_size);
   const int w_end = min(pos_w + pool_size, w_size);
-  return (h_end - h_start) * (w_end - w_start);
+  return mul24((h_end - h_start), (w_end - w_start));
 }
 
 // Supported data type: half/float
@@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input,
   const int out_width = get_global_size(1);
   const int out_hb_idx = get_global_id(2);
 
-  const int batch_idx = (out_hb_idx / out_height) * in_height;
-  const int in_height_start = (out_hb_idx % out_height) * stride - pad_top;
-  const int in_width_start = out_width_idx * stride - pad_left;
-  const int in_channel_offset = out_chan_idx * in_width;
+  const int batch_idx = mul24((out_hb_idx / out_height), in_height);
+  const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
+  const int in_width_start = mul24(out_width_idx, stride) - pad_left;
+  const int in_channel_offset = mul24(out_chan_idx, in_width);
 
 
 #ifdef POOL_AVG
@@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input,
   }
 #endif
 
-  WRITE_IMAGET(output, (int2)(out_chan_idx * out_width + out_width_idx, out_hb_idx), res);
+  WRITE_IMAGET(output, (int2)(mad24(out_chan_idx, out_width, out_width_idx), out_hb_idx), res);
 }
diff --git a/mace/kernels/opencl/cl/relu.cl b/mace/kernels/opencl/cl/relu.cl
index c8ea1c092f898b912db90555fd3309d7d21d349a..e0762bdb0312af28ae49a2b54927e36bfc5b78dc 100644
--- a/mace/kernels/opencl/cl/relu.cl
+++ b/mace/kernels/opencl/cl/relu.cl
@@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input,
   const int hb = get_global_id(2);
   const int width = get_global_size(1);
 
-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
   DATA_TYPE4 out = fmax(in, 0);
   WRITE_IMAGET(output, (int2)(pos, hb), out);
@@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input,
   const int hb = get_global_id(2);
   const int width = get_global_size(1);
 
-  const int pos = ch_blk * width + w;
+  const int pos = mad24(ch_blk, width, w);
   DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
   DATA_TYPE4 out = clamp(in, 0, max_limit);
   WRITE_IMAGET(output, (int2)(pos, hb), out);
diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl
index 72f09c96cd266725237ea7afb8d88fe1915b2573..e0b4b83dcf2e6cc4610d664408db05550a58f0de 100644
--- a/mace/kernels/opencl/cl/resize_bilinear.cl
+++ b/mace/kernels/opencl/cl/resize_bilinear.cl
@@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
   const float h_lerp = h_in - h_lower;
   const float w_lerp = w_in - w_lower;
 
-  const int in_w_offset = ch_blk * in_width;
-  const int in_h_offset = b * in_height;
+  const int in_w_offset = mul24(ch_blk, in_width);
+  const int in_h_offset = mul24(b, in_height);
 
   DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
           (int2)(in_w_offset + w_lower, in_h_offset + h_lower));
@@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
   DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
           (int2)(in_w_offset + w_upper, in_h_offset + h_upper));
 
-  DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
-  DATA_TYPE4 bottom = bottom_left + (bottom_right - bottom_left) * w_lerp;
+  DATA_TYPE4 top = mad((top_right - top_left), w_lerp, top_left);
+  DATA_TYPE4 bottom = mad((bottom_right - bottom_left), w_lerp, bottom_left);
+  DATA_TYPE4 out = mad((bottom - top), h_lerp, top);
 
-  DATA_TYPE4 out = top + (bottom - top) * h_lerp;
-
-  const int out_w_offset = ch_blk * out_width;
-  const int out_h_offset = b * out_height;
+  const int out_w_offset = mul24(ch_blk, out_width);
+  const int out_h_offset = mul24(b, out_height);
   WRITE_IMAGET(output, (int2)(out_w_offset + w, out_h_offset + h), out);
 }