fix round bug in cl kernel

4ab4c17e · liutuo · fbc1d019 · 4ab4c17e · 4ab4c17e · 4ab4c17e
25 changed file
--- a/mace/ops/opencl/cl/batch_to_space.cl
+++ b/mace/ops/opencl/cl/batch_to_space.cl
@@ -25,23 +25,25 @@ __kernel void batch_to_space(OUT_OF_RANGE_PARAMS
 #endif
  const int batch_b_idx = batch_hb_idx / batch_height;
-  const int batch_h_idx = batch_hb_idx % batch_height;
+  const int batch_h_idx = batch_hb_idx - mul24(batch_b_idx, batch_height);
  const int block_size = mul24(block_height, block_width);
-  const int space_b_idx = batch_b_idx % batch_size;
  const int remaining_batch_idx = batch_b_idx / batch_size;
-  const int space_h_idx = (remaining_batch_idx / block_width) +
+  const int space_b_idx = batch_b_idx - mul24(remaining_batch_idx, batch_size);
-      mul24(batch_h_idx, block_height) - padding_height;
+  const int n_h = remaining_batch_idx / block_width;
-  const int space_w_idx = (remaining_batch_idx % block_width) +
+  const int mod_h = remaining_batch_idx - mul24(n_h, block_width);
-      mul24(batch_w_idx, block_width) - padding_width;
+  const int space_h_idx =
+      mad24(batch_h_idx, block_height, n_h - padding_height);
+  const int space_w_idx =
+      mad24(batch_w_idx, block_width, mod_h - padding_width);
  if (0 <= space_w_idx && space_w_idx < space_width &&
      0 <= space_h_idx && space_h_idx < space_height) {
-    int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
+    int2 batch_coord = (int2)(mad24(chan_idx, batch_width, batch_w_idx), batch_hb_idx);
    DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord);
-    int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx,
+    int2 space_coord = (int2)(mad24(chan_idx, space_width, space_w_idx),
-                              space_b_idx * space_height + space_h_idx);
+                              mad24(space_b_idx, space_height, space_h_idx));
    WRITE_IMAGET(space_data, space_coord, value);
  }

--- a/mace/ops/opencl/cl/buffer_to_image.cl
+++ b/mace/ops/opencl/cl/buffer_to_image.cl
@@ -19,11 +19,12 @@ __kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS
 #endif
  const int in_channel_idx = w;
-  const int hw_size = filter_w * filter_h;
+  const int hw_size = mul24(filter_w, filter_h);
-  const int out_channel_idx = h / hw_size * 4;
+  int out_channel_idx = h / hw_size;
-  const int hw_idx = h % hw_size;
+  const int hw_idx = h - mul24(out_channel_idx, hw_size);
+  out_channel_idx = out_channel_idx << 2;
  const int h_idx = hw_idx / filter_w;
-  const int w_idx = hw_idx % filter_w;
+  const int w_idx = hw_idx - mul24(h_idx, filter_w);
  const int offset = input_offset +
      mad24(out_channel_idx, inner_size,
          mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx));
@@ -70,11 +71,12 @@ __kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
 #endif
  const int in_channel_idx = w;
-  const int hw_size = filter_w * filter_h;
+  const int hw_size = mul24(filter_w, filter_h);
-  const int out_channel_idx = h / hw_size * 4;
+  int out_channel_idx = h / hw_size;
-  const int hw_idx = h % hw_size;
+  const int hw_idx = h - mul24(out_channel_idx, hw_size);
+  out_channel_idx = out_channel_idx << 2;
  const int h_idx = hw_idx / filter_w;
-  const int w_idx = hw_idx % filter_w;
+  const int w_idx = hw_idx - mul24(h_idx, filter_w);
  const int offset =
      mad24(out_channel_idx, inner_size,
            mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx));
@@ -86,16 +88,16 @@ __kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
    if (size < 4) {
      switch (size) {
        case 3:
-          output[offset + 2 * inner_size] = values.z;
+          output[offset + (inner_size << 1)] = values.z;
        case 2:
-          output[offset + 1 * inner_size] = values.y;
+          output[offset + inner_size] = values.y;
        case 1:
          output[offset] = values.x;
      }
    } else {
      output[offset + 3 * inner_size] = values.w;
      output[offset + 2 * inner_size] = values.z;
-      output[offset + 1 * inner_size] = values.y;
+      output[offset + inner_size] = values.y;
      output[offset] = values.x;
    }
  }
@@ -124,7 +126,7 @@ __kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
  if (multiplier == 1) {
    const int in_channel_idx = h << 2;
    const int h_idx = w / filter_w;
-    const int w_idx = w % filter_w;
+    const int w_idx = w - mul24(h_idx, filter_w);
    const int offset = input_offset
        + mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx);
@@ -137,14 +139,14 @@ __kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
          case 3:
            values.z = *(input + offset + 2 * hw_size);
          case 2:
-            values.y = *(input + offset + 1 * hw_size);
+            values.y = *(input + offset + hw_size);
          case 1:
            values.x = *(input + offset);
        }
      } else {
        values.x = *(input + offset);
-        values.y = *(input + offset + 1 * hw_size);
+        values.y = *(input + offset + hw_size);
-        values.z = *(input + offset + 2 * hw_size);
+        values.z = *(input + offset + (hw_size << 1));
        values.w = *(input + offset + 3 * hw_size);
      }
    }
@@ -172,11 +174,14 @@ __kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS
 #endif
  const int batch_idx = h / height;
-  const int height_idx = h % height;
+  const int height_idx = h - mul24(batch_idx, height);
-  const int width_idx = w % width;
+  int channel_idx = w / width;
-  const int channel_idx = w / width * 4;
+  const int width_idx = w - mul24(channel_idx, width);
-  const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+  channel_idx = channel_idx << 2;
-                           + channel_idx;
+  const int offset =
+      mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
+            channels,
+            input_offset + channel_idx);
  const int size = channels - channel_idx;
  DATA_TYPE4 values = 0;
@@ -213,11 +218,13 @@ __kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS
 #endif
  const int batch_idx = h / height;
-  const int height_idx = h % height;
+  const int height_idx = h - mul24(batch_idx, height);
-  const int width_idx = w % width;
+  int channel_idx = w / width;
-  const int channel_idx = w / width * 4;
+  const int width_idx = w - mul24(channel_idx, width);
-  const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+  channel_idx = channel_idx << 2;
-                           + channel_idx;
+  const int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
+                           channels,
+                           channel_idx);
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
@@ -251,8 +258,8 @@ __kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS
  }
 #endif
-  const int offset = input_offset + w * 4;
+  const int offset = input_offset + (w << 2);
-  const int size = count - w * 4;
+  const int size = count - (w << 2);
  DATA_TYPE4 values = 0;
@@ -286,7 +293,7 @@ __kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS
  }
 #endif
-  const int offset = w * 4;
+  const int offset = (w << 2);
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
@@ -323,14 +330,15 @@ __kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS
  }
 #endif
-  const int wc = width * channels;
+  const int wc = mul24(width, channels);
-  const int height_blks = (height + 3) / 4;
+  const int height_blks = (height + 3) >> 2;
  const int batch_idx = h / height_blks;
-  const int height_idx = (h % height_blks) << 2;
+  const int height_idx = (h - mul24(batch_idx, height_blks)) << 2;
-  const int width_idx = w % width;
  const int channel_idx = w / width;
-  int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+  const int width_idx = w - mul24(channel_idx, width);
-      + channel_idx;
+  int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
+                     channels,
+                     input_offset+ channel_idx);
  int size = height - height_idx;
  size = size >= 4 ? 0 : size;
@@ -365,25 +373,26 @@ __kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS
  }
  #endif
-  const int height_blks = (height + 3) / 4;
+  const int height_blks = (height + 3) >> 2;
  const int batch_idx = h / height_blks;
-  const int height_idx = (h % height_blks) << 2;
+  const int height_idx = (h - mul24(batch_idx, height_blks)) << 2;
-  const int width_idx = w % width;
  const int channel_idx = w / width;
-  int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+  const int width_idx = w - mul24(channel_idx, width);
-      + channel_idx;
+  int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
+                     channels,
+                     channel_idx);
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
  output[offset] = values.x;
  if (height_idx + 1 >= height) return;
-  offset += width * channels;
+  offset = mad24(width, channels, offset);
  output[offset] = values.y;
  if (height_idx + 2 >= height) return;
-  offset += width * channels;
+  offset = mad24(width, channels, offset);
  output[offset] = values.z;
  if (height_idx + 3 >= height) return;
-  offset += width * channels;
+  offset = mad24(width, channels, offset);
  output[offset] = values.w;
 }
@@ -404,14 +413,15 @@ __kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS
  }
  #endif
-  const int width_blks = (width + 3) / 4;
+  const int width_blks = (width + 3) >> 2;
  const int batch_idx = h / height;
-  const int height_idx = h % height;
+  const int height_idx = h - mul24(batch_idx, height);
-  const int width_idx = (w % width_blks) << 2;
  const int channel_idx = w / width_blks;
-  const int offset = input_offset
+  const int width_idx = (w - mul24(channel_idx, width_blks)) << 2;
-      + ((batch_idx * height + height_idx) * width + width_idx) * channels
+  const int offset =
-      + channel_idx;
+      mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
+            channels,
+            channel_idx + input_offset);
  int size = width - width_idx;
  size = size >= 4 ? 0 : size;
@@ -450,13 +460,13 @@ __kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS
  const int inner_size = global_size_dim0;
  const int out_chan_idx = h << 2;
-  const int in_chan_idx = w % in_channels;
  const int hw_idx = w / in_channels;
+  const int in_chan_idx = w - mul24(hw_idx, in_channels);
  const int height_idx = hw_idx / width;
-  const int width_idx = hw_idx % width;
+  const int width_idx = hw_idx - mul24(height_idx, width);
-  int offset = input_offset +
+  int offset =
      mad24(out_chan_idx, inner_size,
-            mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
+            mad24(mad24(in_chan_idx, height, height_idx), width, width_idx + input_offset));
  int size = out_channels - out_chan_idx;
  size = size >= 4 ? 0 : size;
@@ -494,10 +504,10 @@ __kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS
  const int inner_size = global_size_dim0;
  const int out_chan_idx = h << 2;
-  const int in_chan_idx = w % in_channels;
  const int hw_idx = w / in_channels;
+  const int in_chan_idx = w - mul24(hw_idx, in_channels);
  const int height_idx = hw_idx / width;
-  const int width_idx = hw_idx % width;
+  const int width_idx = hw_idx - mul24(height_idx, width);
  int offset =
      mad24(out_chan_idx, inner_size,
            mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
@@ -535,17 +545,17 @@ __kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS
 #endif
  const int out_channels = global_size_dim1;
  const int in_chan_blks = (in_channels + 3) >> 2;
-  const int hw_size = height * width;
+  const int hw_size = mul24(height, width);
-  const int inner_size = in_channels * hw_size;
+  const int inner_size = mul24(in_channels, hw_size);
  const int out_chan_idx = h;
-  const int in_chan_idx = (w % in_chan_blks) << 2;
  const int hw_idx = w / in_chan_blks;
+  const int in_chan_idx = (w - mul24(hw_idx, in_chan_blks)) << 2;
  const int height_idx = hw_idx / width;
-  const int width_idx = hw_idx % width;
+  const int width_idx = hw_idx - mul24(height_idx, width);
-  int offset = input_offset +
+  int offset =
      mad24(out_chan_idx, inner_size,
-            mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
+            mad24(mad24(in_chan_idx, height, height_idx), width, width_idx + input_offset));
  int size = in_channels - in_chan_idx;
@@ -582,14 +592,14 @@ __kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS
 #endif
  const int out_channels = global_size_dim1;
  const int in_chan_blks = (in_channels + 3) >> 2;
-  const int hw_size = height * width;
+  const int hw_size = mul24(height, width);
-  const int inner_size = in_channels * hw_size;
+  const int inner_size = mul24(in_channels, hw_size);
  const int out_chan_idx = h;
-  const int in_chan_idx = (w % in_chan_blks) << 2;
  const int hw_idx = w / in_chan_blks;
+  const int in_chan_idx = (w - mul24(hw_idx, in_chan_blks)) << 2;
  const int height_idx = hw_idx / width;
-  const int width_idx = hw_idx % width;
+  const int width_idx = hw_idx - mul24(height_idx, width);
  int offset =
      mad24(out_chan_idx, inner_size,
            mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
@@ -629,7 +639,11 @@ __kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
-  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
+  const int hw = mul24(height, width);
+  const int offset =
+      mad24(mad24(out_channel_idx, in_channels, in_channel_idx),
+            hw,
+            input_offset);
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt;
@@ -732,7 +746,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
  const int width_idx = w << 2;
  const int size = width - width_idx;
-  int offset = h * width + width_idx;
+  int offset = mad24(h, width, width_idx);
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values;
@@ -752,7 +766,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
    }
    coord.y += height;
-    offset += height * width;
+    offset = mad24(height, width, offset);
  }
 }
@@ -777,7 +791,9 @@ __kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
-  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
+  const int offset =mad24(mad24(out_channel_idx, in_channels, in_channel_idx),
+                          mul24(height, width),
+                          input_offset);
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt0, tt1, t1;
@@ -909,7 +925,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
  const int width_idx = w << 2;
  const int size = width - width_idx;
-  int offset = h * width + width_idx;
+  int offset = mad24(h, width, width_idx);
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values;
@@ -928,7 +944,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
      vstore4(values, 0, output + offset);
    }
    coord.y += height;
-    offset += height * width;
+    offset = mad24(height, width, offset);
  }
 }
@@ -953,7 +969,9 @@ __kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS
  const int out_channel_idx = h;
  const int in_channel_idx = w << 2;
-  const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
+  const int offset = mad24(mad24(out_channel_idx, in_channels, in_channel_idx),
+                           mul24(height, width),
+                           input_offset);
  const int length = min((in_channels - in_channel_idx) * 9, 36);
  DATA_TYPE in[36] = {0};
  DATA_TYPE4 tt0, tt1, tt2;
@@ -1058,7 +1076,7 @@ __kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
  const int width_idx = w << 2;
  const int size = width - width_idx;
-  int offset = h * width + width_idx;
+  int offset = mad24(h, width, width_idx);
  int2 coord = (int2)(w, h);
  DATA_TYPE4 values;
@@ -1077,6 +1095,6 @@ __kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
      vstore4(values, 0, output + offset);
    }
    coord.y += height;
-    offset += height * width;
+    offset = mad24(height, width, offset);
  }
 }
\ No newline at end of file
--- a/mace/ops/opencl/cl/buffer_transform.cl
+++ b/mace/ops/opencl/cl/buffer_transform.cl
@@ -23,9 +23,11 @@ __kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
 #endif
  const int padded_chan_blk = (padded_chan + 3) >> 2;
  const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
-  const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
+  const int padded_chan_blk_idx =
+      padded_wc_blk_idx - mul24(padded_width_idx, padded_chan_blk);
  const int batch_idx = padded_hb_idx / padded_height;
-  const int padded_height_idx = padded_hb_idx % padded_height;
+  const int padded_height_idx =
+      padded_hb_idx - mul24(batch_idx, padded_height);
  const int padded_chan_idx = padded_chan_blk_idx << 2;
  const int in_height_idx = padded_height_idx - pad_top;
  const int in_width_idx = padded_width_idx - pad_left;
@@ -81,7 +83,7 @@ __kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
  const int out_chan_blk = global_size_dim1;
  const int h_idx = hw_idx / width;
-  const int w_idx = hw_idx % width;
+  const int w_idx = hw_idx - mul24(h_idx, width);
  const int out_chan_idx = out_chan_blk_idx << 2;
  const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
      height, h_idx), width, w_idx) + input_offset;

--- a/mace/ops/opencl/cl/channel_shuffle.cl
+++ b/mace/ops/opencl/cl/channel_shuffle.cl
@@ -19,10 +19,10 @@ __kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
 #endif
  const int width = global_size_dim1;
-  const int group_blks = groups / 4;
+  const int group_blks = groups >> 2;
-  const int groups_blks_width = group_blks * width;
+  const int groups_blks_width = mul24(group_blks, width);
-  const int channels_per_group_blks = channels_per_group / 4;
+  const int channels_per_group_blks = channels_per_group >> 2;
-  const int channels_per_group_blks_width = channels_per_group_blks * width;
+  const int channels_per_group_blks_width = mul24(channels_per_group_blks, width);
  DATA_TYPE4 in_chan_data0, in_chan_data1, in_chan_data2, in_chan_data3;
  DATA_TYPE4 out_chan_data0, out_chan_data1, out_chan_data2, out_chan_data3;

--- a/mace/ops/opencl/cl/conv_2d_1x1.cl
+++ b/mace/ops/opencl/cl/conv_2d_1x1.cl
@@ -45,14 +45,16 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
  w.y = w.x + in_width_stride;
  w.z = w.y + in_width_stride;
  w.w = w.z + in_width_stride;
-  int out_hb_idx = mul24((out_hb % height), stride);
+  int batch = out_hb / height;
+  int h_idx = out_hb - mul24(batch, height);
+  int out_hb_idx = mul24(h_idx, stride);
  w.x = select(w.x, INT_MIN, w.x >= in_width);
  w.y = select(w.y, INT_MIN, w.y >= in_width);
  w.z = select(w.z, INT_MIN, w.z >= in_width);
  w.w = select(w.w, INT_MIN, w.w >= in_width);
-  out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx),
+  out_hb_idx = select(mad24(batch, in_height, out_hb_idx),
                      -1,
                      out_hb_idx >= in_height);

--- a/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
+++ b/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
@@ -30,10 +30,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
  const int out_chan_blk = (out_chan + 3) >> 2;
  const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
-  const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
+  const int out_chan_blk_idx =
+      out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk);
  const int batch_idx = out_hb_idx / out_height;
-  const int out_height_idx = out_hb_idx % out_height;
+  const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height);
  const int out_width_idx = out_width_blk_idx << 1;
  const int out_chan_idx = out_chan_blk_idx << 2;

--- a/mace/ops/opencl/cl/conv_2d_buffer.cl
+++ b/mace/ops/opencl/cl/conv_2d_buffer.cl
@@ -35,10 +35,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
  const int out_chan_blk = (out_chan + 3) >> 2;
  const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
-  const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
+  const int out_chan_blk_idx =
+      out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk);
  const int batch_idx = out_hb_idx / out_height;
-  const int out_height_idx = out_hb_idx % out_height;
+  const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height);
  const int out_width_idx = out_width_blk_idx << 2;
  const int out_chan_idx = out_chan_blk_idx << 2;

--- a/mace/ops/opencl/cl/crop.cl
+++ b/mace/ops/opencl/cl/crop.cl
@@ -27,7 +27,7 @@ __kernel void crop(OUT_OF_RANGE_PARAMS
 #endif
  const int b = hb_idx / out_height;
-  const int h = hb_idx % out_height;
+  const int h = hb_idx - mul24(b, out_height);
  const int in_chan_blk_idx = chan_blk_idx + offset_chan_blk;
  const int in_width_idx = width_idx + offset_w;
  const int in_h = h + offset_h;

--- a/mace/ops/opencl/cl/depth_to_space.cl
+++ b/mace/ops/opencl/cl/depth_to_space.cl
@@ -24,10 +24,10 @@ __kernel void depth_to_space(OUT_OF_RANGE_PARAMS
  const int out_pos = mad24(out_d, output_width, out_w);
  const int in_hb = out_hb / block_size;
-  const int offset_h = out_hb % block_size;
+  const int offset_h = out_hb - mul24(in_hb, block_size);
  const int in_w = out_w / block_size;
-  const int offset_w = out_w % block_size;
+  const int offset_w = out_w - mul24(in_w, block_size);
-  const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks;
+  const int offset_d = mul24(mad24(offset_h, block_size, offset_w), output_depth_blocks);
  const int in_d = out_d + offset_d;
  if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) {

--- a/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
+++ b/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
@@ -35,10 +35,10 @@ __kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS
  const int out_chan_blk = (out_chan + 3) >> 2;
  const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
-  const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
+  const int out_chan_blk_idx = out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk);
  const int batch_idx = out_hb_idx / out_height;
-  const int out_height_idx = out_hb_idx % out_height;
+  const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height);
  const int out_width_idx = out_width_blk_idx << 2;
  const int out_chan_idx = out_chan_blk_idx << 2;
  const int in_chan_idx = out_chan_idx;

--- a/mace/ops/opencl/cl/fully_connected.cl
+++ b/mace/ops/opencl/cl/fully_connected.cl
@@ -35,7 +35,7 @@ __kernel void fully_connected(OUT_OF_RANGE_PARAMS
  for (short h_idx = 0; h_idx < input_height; ++h_idx) {
    for (short w_idx = 0; w_idx < input_width; ++w_idx) {
      input_coord.x = w_idx;
-      weight_x = (h_idx * input_width + w_idx) * input_channel;
+      weight_x = mul24(mad24(h_idx, input_width, w_idx), input_channel);
 #pragma unroll
      for (short chan_idx = 0; chan_idx < input_chan_blk; ++chan_idx) {
        input_value = READ_IMAGET(input, SAMPLER, input_coord);
@@ -84,7 +84,7 @@ __kernel void fully_connected_width(OUT_OF_RANGE_PARAMS
  const int batch_out_blk_idx = get_global_id(2);
  const int batch_idx = batch_out_blk_idx / out_blks;
-  const int out_blk_idx = batch_out_blk_idx % out_blks;
+  const int out_blk_idx = batch_out_blk_idx - mul24(batch_idx, out_blks);
  const short in_outer_size = mul24(input_width, in_chan_blks);
  const short weight_y = mad24(out_blk_idx, 4, inter_out_idx);

--- a/mace/ops/opencl/cl/lstmcell.cl
+++ b/mace/ops/opencl/cl/lstmcell.cl
@@ -20,117 +20,118 @@ __kernel void lstmcell(OUT_OF_RANGE_PARAMS
  if (w_blk_idx >= global_size_dim0 || h_idx >= global_size_dim1) return;
 #endif
-  // fc_res0 -> i
-  // fc_res1 -> j
-  // fc_res2 -> f
-  // fc_res3 -> o
-  DATA_TYPE4 fc_res0 = 0.0, fc_res1 = 0.0, fc_res2 = 0.0, fc_res3 = 0.0;
  DATA_TYPE4 in, pre_h;
  DATA_TYPE4 w0, w1, w2, w3;
  int k_offset;
  // concat matmul
+  const int pos_x0 = w_blk_idx;
+  const int pos_x1 = pos_x0 + global_size_dim0;
+  const int pos_x2 = pos_x1 + global_size_dim0;
+  const int pos_x3 = pos_x2 + global_size_dim0;
+  // bias
+  // fc_res0 -> i
+  // fc_res1 -> j
+  // fc_res2 -> f
+  // fc_res3 -> o
+  DATA_TYPE4 fc_res0, fc_res1, fc_res2, fc_res3;
+  fc_res0 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x0, 0));
+  fc_res1 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x1, 0));
+  fc_res2 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x2, 0));
+  fc_res3 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x3, 0));
  for (short i = 0; i < in_w_blk; ++i) {
    in = READ_IMAGET(input, SAMPLER, (int2)(i, h_idx));
    int k = i << 2;
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k));
-    fc_res0 += in.x * w0;
+    fc_res0 = mad(in.x, w0, fc_res0);
-    fc_res1 += in.x * w1;
+    fc_res1 = mad(in.x, w1, fc_res1);
-    fc_res2 += in.x * w2;
+    fc_res2 = mad(in.x, w2, fc_res2);
-    fc_res3 += in.x * w3;
+    fc_res3 = mad(in.x, w3, fc_res3);
    k += 1;
    k_offset = select(-1, k, k < width);
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k_offset));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k_offset));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k_offset));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k_offset));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k_offset));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k_offset));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k_offset));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k_offset));
-    fc_res0 += in.y * w0;
+    fc_res0 = mad(in.y, w0, fc_res0);
-    fc_res1 += in.y * w1;
+    fc_res1 = mad(in.y, w1, fc_res1);
-    fc_res2 += in.y * w2;
+    fc_res2 = mad(in.y, w2, fc_res2);
-    fc_res3 += in.y * w3;
+    fc_res3 = mad(in.y, w3, fc_res3);
    k += 1;
    k_offset = select(-1, k, k < width);
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k_offset));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k_offset));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k_offset));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k_offset));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k_offset));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k_offset));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k_offset));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k_offset));
-    fc_res0 += in.z * w0;
+    fc_res0 = mad(in.z, w0, fc_res0);
-    fc_res1 += in.z * w1;
+    fc_res1 = mad(in.z, w1, fc_res1);
-    fc_res2 += in.z * w2;
+    fc_res2 = mad(in.z, w2, fc_res2);
-    fc_res3 += in.z * w3;
+    fc_res3 = mad(in.z, w3, fc_res3);
    k += 1;
    k_offset = select(-1, k, k < width);
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k_offset));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k_offset));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k_offset));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k_offset));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k_offset));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k_offset));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k_offset));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k_offset));
-    fc_res0 += in.w * w0;
+    fc_res0 = mad(in.w, w0, fc_res0);
-    fc_res1 += in.w * w1;
+    fc_res1 = mad(in.w, w1, fc_res1);
-    fc_res2 += in.w * w2;
+    fc_res2 = mad(in.w, w2, fc_res2);
-    fc_res3 += in.w * w3;
+    fc_res3 = mad(in.w, w3, fc_res3);
  }
  for (short i = 0; i < global_size_dim0; ++i) {
    pre_h = READ_IMAGET(pre_output, SAMPLER, (int2)(i, h_idx));
    int k = (i << 2) + width;
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k));
-    fc_res0 += pre_h.x * w0;
+    fc_res0 = mad(pre_h.x, w0, fc_res0);
-    fc_res1 += pre_h.x * w1;
+    fc_res1 = mad(pre_h.x, w1, fc_res1);
-    fc_res2 += pre_h.x * w2;
+    fc_res2 = mad(pre_h.x, w2, fc_res2);
-    fc_res3 += pre_h.x * w3;
+    fc_res3 = mad(pre_h.x, w3, fc_res3);
    k += 1;
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k));
-    fc_res0 += pre_h.y * w0;
+    fc_res0 = mad(pre_h.y, w0, fc_res0);
-    fc_res1 += pre_h.y * w1;
+    fc_res1 = mad(pre_h.y, w1, fc_res1);
-    fc_res2 += pre_h.y * w2;
+    fc_res2 = mad(pre_h.y, w2, fc_res2);
-    fc_res3 += pre_h.y * w3;
+    fc_res3 = mad(pre_h.y, w3, fc_res3);
    k += 1;
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k));
-    fc_res0 += pre_h.z * w0;
+    fc_res0 = mad(pre_h.z, w0, fc_res0);
-    fc_res1 += pre_h.z * w1;
+    fc_res1 = mad(pre_h.z, w1, fc_res1);
-    fc_res2 += pre_h.z * w2;
+    fc_res2 = mad(pre_h.z, w2, fc_res2);
-    fc_res3 += pre_h.z * w3;
+    fc_res3 = mad(pre_h.z, w3, fc_res3);
    k += 1;
-    w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k));
+    w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k));
-    w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k));
+    w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k));
-    w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k));
+    w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k));
-    w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k));
+    w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k));
-    fc_res0 += pre_h.w * w0;
+    fc_res0 = mad(pre_h.w, w0, fc_res0);
-    fc_res1 += pre_h.w * w1;
+    fc_res1 = mad(pre_h.w, w1, fc_res1);
-    fc_res2 += pre_h.w * w2;
+    fc_res2 = mad(pre_h.w, w2, fc_res2);
-    fc_res3 += pre_h.w * w3;
+    fc_res3 = mad(pre_h.w, w3, fc_res3);
  }
-  // bias
-  DATA_TYPE4 b0, b1, b2, b3;
-  b0 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx, 0));
-  b1 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx + global_size_dim0, 0));
-  b2 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, 0));
-  b3 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, 0));
-  fc_res0 += b0;
-  fc_res1 += b1;
-  fc_res2 += b2;
-  fc_res3 += b3;
  // gate
  DATA_TYPE4 pre_c, c, h;
  pre_c = READ_IMAGET(pre_cell, SAMPLER, (int2)(w_blk_idx, h_idx));

--- a/mace/ops/opencl/cl/matmul.cl
+++ b/mace/ops/opencl/cl/matmul.cl
@@ -19,7 +19,7 @@ __kernel void matmul(OUT_OF_RANGE_PARAMS
 #endif
  const int batch = hb / height_blocks;
-  const int ty = (hb % height_blocks);
+  const int ty = hb - mul24(batch, height_blocks);
  const int gy = mad24(batch, height_blocks, ty);
  const int bm = mad24(batch, M, ty << 2);
  const int bk = mul24(batch, k_blocks);

--- a/mace/ops/opencl/cl/pad.cl
+++ b/mace/ops/opencl/cl/pad.cl
@@ -14,7 +14,7 @@ __kernel void pad(OUT_OF_RANGE_PARAMS
  const int width_idx = get_global_id(1);
  const int hb_idx = get_global_id(2);
  const int batch_idx = hb_idx / output_height;
-  const int height_idx = hb_idx % output_height;
+  const int height_idx = hb_idx - mul24(batch_idx, output_height);
  const int input_padded_height = input_height + height_padding;
  const int input_padded_width = input_width + width_padding;

--- a/mace/ops/opencl/cl/pooling.cl
+++ b/mace/ops/opencl/cl/pooling.cl
@@ -42,9 +42,11 @@ __kernel void pooling(OUT_OF_RANGE_PARAMS
 #endif
  const int out_width = global_size_dim1;
-  const int batch_idx = mul24((out_hb_idx / out_height), in_height);
+  const int n_b = out_hb_idx / out_height;
-  const int in_height_start = mul24((out_hb_idx % out_height), stride_h) - pad_top;
+  const int mod_b = out_hb_idx - mul24(n_b, out_height);
-  const int in_width_start = mul24(out_width_idx, stride_w) - pad_left;
+  const int batch_idx = mul24(n_b, in_height);
+  const int in_height_start = mad24(mod_b, stride_h, -pad_top);
+  const int in_width_start = mad24(out_width_idx, stride_w, -pad_left);
  const int in_channel_offset = mul24(out_chan_idx, in_width);

--- a/mace/ops/opencl/cl/pooling_buffer.cl
+++ b/mace/ops/opencl/cl/pooling_buffer.cl
@@ -47,7 +47,7 @@ __kernel void pooling(BUFFER_OUT_OF_RANGE_PARAMS
  const int in_wc_size = mul24(in_width, in_chan);
  const int batch_idx = out_hb_idx / out_height;
-  const int out_height_idx = out_hb_idx % out_height;
+  const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height);
  const int chan_idx = out_chan_blk_idx << 2;
  const int in_height_start = mul24(out_height_idx, stride_h) - pad_top;
  const int in_width_start = mul24(out_width_idx, stride_w) - pad_left;

--- a/mace/ops/opencl/cl/reduce_mean.cl
+++ b/mace/ops/opencl/cl/reduce_mean.cl
@@ -11,9 +11,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS
                          __private const int in_height,
                          __private const int in_width,
                          __private const float image_size_reciprocal,
-                          __private const float in_width_reciprocal,
                          __private const int channel_blocks,
-                          __private const float channel_blocks_reciprocal,
                          __write_only image2d_t output) {
  const int i = get_local_id(0);
  const int j = get_local_id(1);
@@ -26,7 +24,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS
  const int dim0_size = get_local_size(0);
  float4 tmp = (float4){0, 0, 0, 0};
  const int index = mad24(j, dim0_size, i);
-  const int b = floor(k * channel_blocks_reciprocal);
+  const int b = k / channel_blocks;
  const int ch = mad24(b, -channel_blocks, k);
  DATA_TYPE4 in;
@@ -40,7 +38,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS
 #pragma unroll
  for (int l = 0; l < valid_part_len; ++l) {
    int offset = base_offset + l;
-    int h_id = floor(offset * in_width_reciprocal);
+    int h_id = offset / in_width;
    int w_id = mad24(h_id, -in_width, offset);
    int pos_x = mad24(ch, in_width, w_id);
    int pos_y = mad24(b, in_height, h_id);

--- a/mace/ops/opencl/cl/resize_bicubic.cl
+++ b/mace/ops/opencl/cl/resize_bicubic.cl
@@ -36,7 +36,7 @@ __kernel void resize_bicubic_nocache(OUT_OF_RANGE_PARAMS
 #endif
  const int b = hb / out_height;
-  const int h = hb % out_height;
+  const int h = hb - mul24(b, out_height);
  const float h_in = h * height_scale;
  const float w_in = w * width_scale;

--- a/mace/ops/opencl/cl/resize_bilinear.cl
+++ b/mace/ops/opencl/cl/resize_bilinear.cl
@@ -24,7 +24,7 @@ __kernel void resize_bilinear_nocache(OUT_OF_RANGE_PARAMS
  const int out_width = global_size_dim1;
  const int b = hb / out_height;
-  const int h = hb % out_height;
+  const int h = hb - mul24(b, out_height);
  const float h_in = h * height_scale;
  const float w_in = w * width_scale;

--- a/mace/ops/opencl/cl/softmax_buffer.cl
+++ b/mace/ops/opencl/cl/softmax_buffer.cl
@@ -20,7 +20,7 @@ __kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS
  const int chan_blks = global_size_dim0 - 1;
  const int width = global_size_dim1;
  const int batch_idx = hb_idx / height;
-  const int height_idx = hb_idx % height;
+  const int height_idx = hb_idx - mul24(batch_idx, height);
  const int chan_idx = chan_blk_idx << 2;
  const int offset_base = mul24(mad24(mad24(batch_idx, height, height_idx),

--- a/mace/ops/opencl/cl/space_to_batch.cl
+++ b/mace/ops/opencl/cl/space_to_batch.cl
@@ -25,14 +25,19 @@ __kernel void space_to_batch(OUT_OF_RANGE_PARAMS
 #endif
  const int batch_b_idx = batch_hb_idx / batch_height;
-  const int batch_h_idx = batch_hb_idx % batch_height;
+  const int batch_h_idx = batch_hb_idx - mul24(batch_b_idx, batch_height);
  const int block_size = mul24(block_height, block_width);
-  const int space_b_idx = batch_b_idx % batch_size;
  const int remaining_batch_idx = batch_b_idx / batch_size;
-  const int space_h_idx = (remaining_batch_idx / block_width) +
+  const int space_b_idx =
+      batch_b_idx - mul24(remaining_batch_idx, batch_size);
+  const int n_remain_blk_w = remaining_batch_idx / block_width;
+  const int mod_remain_blk_w =
+      remaining_batch_idx - mul24(n_remain_blk_w, block_width);
+  const int space_h_idx = n_remain_blk_w +
      mul24(batch_h_idx, block_height) - padding_height;
-  const int space_w_idx = (remaining_batch_idx % block_width) +
+  const int space_w_idx = mod_remain_blk_w +
      mul24(batch_w_idx, block_width) - padding_width;
  const int space_coord_x = select(mul24(chan_idx, space_width) + space_w_idx,

--- a/mace/ops/opencl/cl/space_to_depth.cl
+++ b/mace/ops/opencl/cl/space_to_depth.cl
@@ -24,10 +24,10 @@ __kernel void space_to_depth(OUT_OF_RANGE_PARAMS
  const int in_pos = mad24(d, input_width, w);
  const int out_hb = hb / block_size;
-  const int offset_h = hb % block_size;
+  const int offset_h = hb - mul24(out_hb, block_size);
  const int out_w = w / block_size;
-  const int offset_w = w % block_size;
+  const int offset_w = w - mul24(out_w, block_size);
-  const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
+  const int offset_d = mul24(input_depth_blocks, mad24(offset_h, block_size, offset_w));
  const int out_d = d + offset_d;
  if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) {

--- a/mace/ops/opencl/cl/sqrdiff_mean.cl
+++ b/mace/ops/opencl/cl/sqrdiff_mean.cl
@@ -12,9 +12,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS
                           __private const int in_height,
                           __private const int in_width,
                           __private const float image_size_reciprocal,
-                           __private const float in_width_reciprocal,
                           __private const int channel_blocks,
-                           __private const float channel_blocks_reciprocal,
                           __write_only image2d_t output) {
  const int i = get_local_id(0);
  const int j = get_local_id(1);
@@ -27,7 +25,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS
  const int dim0_size = get_local_size(0);
  float4 tmp = (float4){0, 0, 0, 0};
  const int index = mad24(j, dim0_size, i);
-  const int b = floor(k * channel_blocks_reciprocal);
+  const int b = k / channel_blocks;
  const int ch = mad24(b, -channel_blocks, k);
  DATA_TYPE4 in;
@@ -43,7 +41,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS
 #pragma unroll
  for (int l = 0; l < valid_part_len; ++l) {
    int offset = base_offset + l;
-    int h_id = floor(offset * in_width_reciprocal);
+    int h_id = offset / in_width;
    int w_id = mad24(h_id, -in_width, offset);
    int pos_x = mad24(ch, in_width, w_id);
    int pos_y = mad24(b, in_height, h_id);

--- a/mace/ops/opencl/image/reduce_mean.h
+++ b/mace/ops/opencl/image/reduce_mean.h
@@ -111,9 +111,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
  const int group_size = lws[0] * lws[1] * lws[2];
  const int partial_len = (image_size + group_size - 1) / group_size;
  const int remain_index = image_size % group_size;
-  const float in_width_reciprocal = 1.f / in_width;
  const float img_size_reciprocal = 1.f / (in_width * in_height);
-  const float channel_blk_reciprocal = 1.f / channel_blocks;
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input->shape())) {
@@ -130,9 +128,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, img_size_reciprocal);
-    kernel_.setArg(idx++, in_width_reciprocal);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
-    kernel_.setArg(idx++, channel_blk_reciprocal);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input->shape();

--- a/mace/ops/opencl/image/sqrdiff_mean.h
+++ b/mace/ops/opencl/image/sqrdiff_mean.h
@@ -107,9 +107,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
  const int group_size = lws[0] * lws[1] * lws[2];
  const int partial_len = (image_size + group_size - 1) / group_size;
  const int remain_index = image_size % group_size;
-  const float in_width_reciprocal = 1.f / in_width;
  const float img_size_reciprocal = 1.f / (in_width * in_height);
-  const float channel_blk_reciprocal = 1.f / channel_blocks;
  MACE_OUT_OF_RANGE_INIT(kernel_);
  if (!IsVecEqual(input_shape_, input0->shape())) {
@@ -127,9 +125,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
    kernel_.setArg(idx++, static_cast<int32_t>(in_height));
    kernel_.setArg(idx++, static_cast<int32_t>(in_width));
    kernel_.setArg(idx++, img_size_reciprocal);
-    kernel_.setArg(idx++, in_width_reciprocal);
    kernel_.setArg(idx++, static_cast<int32_t>(channel_blocks));
-    kernel_.setArg(idx++, channel_blk_reciprocal);
    kernel_.setArg(idx++, *(output->opencl_image()));
    input_shape_ = input0->shape();