diff --git a/mace/ops/opencl/cl/batch_to_space.cl b/mace/ops/opencl/cl/batch_to_space.cl index de59bb86e2f4586fb223bbb44c1262ec9809ca90..73f58dcc99545c7a5abd5b62cfef07769f11a73a 100644 --- a/mace/ops/opencl/cl/batch_to_space.cl +++ b/mace/ops/opencl/cl/batch_to_space.cl @@ -25,23 +25,25 @@ __kernel void batch_to_space(OUT_OF_RANGE_PARAMS #endif const int batch_b_idx = batch_hb_idx / batch_height; - const int batch_h_idx = batch_hb_idx % batch_height; + const int batch_h_idx = batch_hb_idx - mul24(batch_b_idx, batch_height); const int block_size = mul24(block_height, block_width); - const int space_b_idx = batch_b_idx % batch_size; const int remaining_batch_idx = batch_b_idx / batch_size; - const int space_h_idx = (remaining_batch_idx / block_width) + - mul24(batch_h_idx, block_height) - padding_height; - const int space_w_idx = (remaining_batch_idx % block_width) + - mul24(batch_w_idx, block_width) - padding_width; + const int space_b_idx = batch_b_idx - mul24(remaining_batch_idx, batch_size); + const int n_h = remaining_batch_idx / block_width; + const int mod_h = remaining_batch_idx - mul24(n_h, block_width); + const int space_h_idx = + mad24(batch_h_idx, block_height, n_h - padding_height); + const int space_w_idx = + mad24(batch_w_idx, block_width, mod_h - padding_width); if (0 <= space_w_idx && space_w_idx < space_width && 0 <= space_h_idx && space_h_idx < space_height) { - int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx); + int2 batch_coord = (int2)(mad24(chan_idx, batch_width, batch_w_idx), batch_hb_idx); DATA_TYPE4 value = READ_IMAGET(batch_data, SAMPLER, batch_coord); - int2 space_coord = (int2)(mul24(chan_idx, space_width) + space_w_idx, - space_b_idx * space_height + space_h_idx); + int2 space_coord = (int2)(mad24(chan_idx, space_width, space_w_idx), + mad24(space_b_idx, space_height, space_h_idx)); WRITE_IMAGET(space_data, space_coord, value); } diff --git a/mace/ops/opencl/cl/buffer_to_image.cl b/mace/ops/opencl/cl/buffer_to_image.cl index ac0fb7788f0f3e95cbf00cc04623b3ef015f2038..9a116aea7328d3adc6d5510924b49783ce713bc6 100644 --- a/mace/ops/opencl/cl/buffer_to_image.cl +++ b/mace/ops/opencl/cl/buffer_to_image.cl @@ -19,11 +19,12 @@ __kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS #endif const int in_channel_idx = w; - const int hw_size = filter_w * filter_h; - const int out_channel_idx = h / hw_size * 4; - const int hw_idx = h % hw_size; + const int hw_size = mul24(filter_w, filter_h); + int out_channel_idx = h / hw_size; + const int hw_idx = h - mul24(out_channel_idx, hw_size); + out_channel_idx = out_channel_idx << 2; const int h_idx = hw_idx / filter_w; - const int w_idx = hw_idx % filter_w; + const int w_idx = hw_idx - mul24(h_idx, filter_w); const int offset = input_offset + mad24(out_channel_idx, inner_size, mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx)); @@ -70,11 +71,12 @@ __kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS #endif const int in_channel_idx = w; - const int hw_size = filter_w * filter_h; - const int out_channel_idx = h / hw_size * 4; - const int hw_idx = h % hw_size; + const int hw_size = mul24(filter_w, filter_h); + int out_channel_idx = h / hw_size; + const int hw_idx = h - mul24(out_channel_idx, hw_size); + out_channel_idx = out_channel_idx << 2; const int h_idx = hw_idx / filter_w; - const int w_idx = hw_idx % filter_w; + const int w_idx = hw_idx - mul24(h_idx, filter_w); const int offset = mad24(out_channel_idx, inner_size, mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx)); @@ -86,16 +88,16 @@ __kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS if (size < 4) { switch (size) { case 3: - output[offset + 2 * inner_size] = values.z; + output[offset + (inner_size << 1)] = values.z; case 2: - output[offset + 1 * inner_size] = values.y; + output[offset + inner_size] = values.y; case 1: output[offset] = values.x; } } else { output[offset + 3 * inner_size] = values.w; output[offset + 2 * inner_size] = values.z; - output[offset + 1 * inner_size] = values.y; + output[offset + inner_size] = values.y; output[offset] = values.x; } } @@ -124,7 +126,7 @@ __kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS if (multiplier == 1) { const int in_channel_idx = h << 2; const int h_idx = w / filter_w; - const int w_idx = w % filter_w; + const int w_idx = w - mul24(h_idx, filter_w); const int offset = input_offset + mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx); @@ -137,14 +139,14 @@ __kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS case 3: values.z = *(input + offset + 2 * hw_size); case 2: - values.y = *(input + offset + 1 * hw_size); + values.y = *(input + offset + hw_size); case 1: values.x = *(input + offset); } } else { values.x = *(input + offset); - values.y = *(input + offset + 1 * hw_size); - values.z = *(input + offset + 2 * hw_size); + values.y = *(input + offset + hw_size); + values.z = *(input + offset + (hw_size << 1)); values.w = *(input + offset + 3 * hw_size); } } @@ -172,11 +174,14 @@ __kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS #endif const int batch_idx = h / height; - const int height_idx = h % height; - const int width_idx = w % width; - const int channel_idx = w / width * 4; - const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels - + channel_idx; + const int height_idx = h - mul24(batch_idx, height); + int channel_idx = w / width; + const int width_idx = w - mul24(channel_idx, width); + channel_idx = channel_idx << 2; + const int offset = + mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx), + channels, + input_offset + channel_idx); const int size = channels - channel_idx; DATA_TYPE4 values = 0; @@ -213,11 +218,13 @@ __kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS #endif const int batch_idx = h / height; - const int height_idx = h % height; - const int width_idx = w % width; - const int channel_idx = w / width * 4; - const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels - + channel_idx; + const int height_idx = h - mul24(batch_idx, height); + int channel_idx = w / width; + const int width_idx = w - mul24(channel_idx, width); + channel_idx = channel_idx << 2; + const int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx), + channels, + channel_idx); int2 coord = (int2)(w, h); DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord); @@ -251,8 +258,8 @@ __kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS } #endif - const int offset = input_offset + w * 4; - const int size = count - w * 4; + const int offset = input_offset + (w << 2); + const int size = count - (w << 2); DATA_TYPE4 values = 0; @@ -286,7 +293,7 @@ __kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS } #endif - const int offset = w * 4; + const int offset = (w << 2); int2 coord = (int2)(w, h); DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord); @@ -323,14 +330,15 @@ __kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS } #endif - const int wc = width * channels; - const int height_blks = (height + 3) / 4; + const int wc = mul24(width, channels); + const int height_blks = (height + 3) >> 2; const int batch_idx = h / height_blks; - const int height_idx = (h % height_blks) << 2; - const int width_idx = w % width; + const int height_idx = (h - mul24(batch_idx, height_blks)) << 2; const int channel_idx = w / width; - int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels - + channel_idx; + const int width_idx = w - mul24(channel_idx, width); + int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx), + channels, + input_offset+ channel_idx); int size = height - height_idx; size = size >= 4 ? 0 : size; @@ -365,25 +373,26 @@ __kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS } #endif - const int height_blks = (height + 3) / 4; + const int height_blks = (height + 3) >> 2; const int batch_idx = h / height_blks; - const int height_idx = (h % height_blks) << 2; - const int width_idx = w % width; + const int height_idx = (h - mul24(batch_idx, height_blks)) << 2; const int channel_idx = w / width; - int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels - + channel_idx; + const int width_idx = w - mul24(channel_idx, width); + int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx), + channels, + channel_idx); int2 coord = (int2)(w, h); DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord); output[offset] = values.x; if (height_idx + 1 >= height) return; - offset += width * channels; + offset = mad24(width, channels, offset); output[offset] = values.y; if (height_idx + 2 >= height) return; - offset += width * channels; + offset = mad24(width, channels, offset); output[offset] = values.z; if (height_idx + 3 >= height) return; - offset += width * channels; + offset = mad24(width, channels, offset); output[offset] = values.w; } @@ -404,14 +413,15 @@ __kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS } #endif - const int width_blks = (width + 3) / 4; + const int width_blks = (width + 3) >> 2; const int batch_idx = h / height; - const int height_idx = h % height; - const int width_idx = (w % width_blks) << 2; + const int height_idx = h - mul24(batch_idx, height); const int channel_idx = w / width_blks; - const int offset = input_offset - + ((batch_idx * height + height_idx) * width + width_idx) * channels - + channel_idx; + const int width_idx = (w - mul24(channel_idx, width_blks)) << 2; + const int offset = + mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx), + channels, + channel_idx + input_offset); int size = width - width_idx; size = size >= 4 ? 0 : size; @@ -450,13 +460,13 @@ __kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS const int inner_size = global_size_dim0; const int out_chan_idx = h << 2; - const int in_chan_idx = w % in_channels; const int hw_idx = w / in_channels; + const int in_chan_idx = w - mul24(hw_idx, in_channels); const int height_idx = hw_idx / width; - const int width_idx = hw_idx % width; - int offset = input_offset + + const int width_idx = hw_idx - mul24(height_idx, width); + int offset = mad24(out_chan_idx, inner_size, - mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); + mad24(mad24(in_chan_idx, height, height_idx), width, width_idx + input_offset)); int size = out_channels - out_chan_idx; size = size >= 4 ? 0 : size; @@ -494,10 +504,10 @@ __kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS const int inner_size = global_size_dim0; const int out_chan_idx = h << 2; - const int in_chan_idx = w % in_channels; const int hw_idx = w / in_channels; + const int in_chan_idx = w - mul24(hw_idx, in_channels); const int height_idx = hw_idx / width; - const int width_idx = hw_idx % width; + const int width_idx = hw_idx - mul24(height_idx, width); int offset = mad24(out_chan_idx, inner_size, mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); @@ -535,17 +545,17 @@ __kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS #endif const int out_channels = global_size_dim1; const int in_chan_blks = (in_channels + 3) >> 2; - const int hw_size = height * width; - const int inner_size = in_channels * hw_size; + const int hw_size = mul24(height, width); + const int inner_size = mul24(in_channels, hw_size); const int out_chan_idx = h; - const int in_chan_idx = (w % in_chan_blks) << 2; const int hw_idx = w / in_chan_blks; + const int in_chan_idx = (w - mul24(hw_idx, in_chan_blks)) << 2; const int height_idx = hw_idx / width; - const int width_idx = hw_idx % width; - int offset = input_offset + + const int width_idx = hw_idx - mul24(height_idx, width); + int offset = mad24(out_chan_idx, inner_size, - mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); + mad24(mad24(in_chan_idx, height, height_idx), width, width_idx + input_offset)); int size = in_channels - in_chan_idx; @@ -582,14 +592,14 @@ __kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS #endif const int out_channels = global_size_dim1; const int in_chan_blks = (in_channels + 3) >> 2; - const int hw_size = height * width; - const int inner_size = in_channels * hw_size; + const int hw_size = mul24(height, width); + const int inner_size = mul24(in_channels, hw_size); const int out_chan_idx = h; - const int in_chan_idx = (w % in_chan_blks) << 2; const int hw_idx = w / in_chan_blks; + const int in_chan_idx = (w - mul24(hw_idx, in_chan_blks)) << 2; const int height_idx = hw_idx / width; - const int width_idx = hw_idx % width; + const int width_idx = hw_idx - mul24(height_idx, width); int offset = mad24(out_chan_idx, inner_size, mad24(mad24(in_chan_idx, height, height_idx), width, width_idx)); @@ -629,7 +639,11 @@ __kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS const int out_channel_idx = h; const int in_channel_idx = w << 2; - const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int hw = mul24(height, width); + const int offset = + mad24(mad24(out_channel_idx, in_channels, in_channel_idx), + hw, + input_offset); const int length = min((in_channels - in_channel_idx) * 9, 36); DATA_TYPE in[36] = {0}; DATA_TYPE4 tt; @@ -732,7 +746,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS const int width_idx = w << 2; const int size = width - width_idx; - int offset = h * width + width_idx; + int offset = mad24(h, width, width_idx); int2 coord = (int2)(w, h); DATA_TYPE4 values; @@ -752,7 +766,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS } coord.y += height; - offset += height * width; + offset = mad24(height, width, offset); } } @@ -777,7 +791,9 @@ __kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS const int out_channel_idx = h; const int in_channel_idx = w << 2; - const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int offset =mad24(mad24(out_channel_idx, in_channels, in_channel_idx), + mul24(height, width), + input_offset); const int length = min((in_channels - in_channel_idx) * 9, 36); DATA_TYPE in[36] = {0}; DATA_TYPE4 tt0, tt1, t1; @@ -909,7 +925,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS const int width_idx = w << 2; const int size = width - width_idx; - int offset = h * width + width_idx; + int offset = mad24(h, width, width_idx); int2 coord = (int2)(w, h); DATA_TYPE4 values; @@ -928,7 +944,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS vstore4(values, 0, output + offset); } coord.y += height; - offset += height * width; + offset = mad24(height, width, offset); } } @@ -953,7 +969,9 @@ __kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS const int out_channel_idx = h; const int in_channel_idx = w << 2; - const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width; + const int offset = mad24(mad24(out_channel_idx, in_channels, in_channel_idx), + mul24(height, width), + input_offset); const int length = min((in_channels - in_channel_idx) * 9, 36); DATA_TYPE in[36] = {0}; DATA_TYPE4 tt0, tt1, tt2; @@ -1058,7 +1076,7 @@ __kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS const int width_idx = w << 2; const int size = width - width_idx; - int offset = h * width + width_idx; + int offset = mad24(h, width, width_idx); int2 coord = (int2)(w, h); DATA_TYPE4 values; @@ -1077,6 +1095,6 @@ __kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS vstore4(values, 0, output + offset); } coord.y += height; - offset += height * width; + offset = mad24(height, width, offset); } } \ No newline at end of file diff --git a/mace/ops/opencl/cl/buffer_transform.cl b/mace/ops/opencl/cl/buffer_transform.cl index d5f00e31b19aed182e63e3405322c0184c6c2f17..0a7674d66f5fbc08b4443da288f26ac5ff5c1d47 100644 --- a/mace/ops/opencl/cl/buffer_transform.cl +++ b/mace/ops/opencl/cl/buffer_transform.cl @@ -23,9 +23,11 @@ __kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS #endif const int padded_chan_blk = (padded_chan + 3) >> 2; const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk; - const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk; + const int padded_chan_blk_idx = + padded_wc_blk_idx - mul24(padded_width_idx, padded_chan_blk); const int batch_idx = padded_hb_idx / padded_height; - const int padded_height_idx = padded_hb_idx % padded_height; + const int padded_height_idx = + padded_hb_idx - mul24(batch_idx, padded_height); const int padded_chan_idx = padded_chan_blk_idx << 2; const int in_height_idx = padded_height_idx - pad_top; const int in_width_idx = padded_width_idx - pad_left; @@ -81,7 +83,7 @@ __kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS const int out_chan_blk = global_size_dim1; const int h_idx = hw_idx / width; - const int w_idx = hw_idx % width; + const int w_idx = hw_idx - mul24(h_idx, width); const int out_chan_idx = out_chan_blk_idx << 2; const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx), height, h_idx), width, w_idx) + input_offset; diff --git a/mace/ops/opencl/cl/channel_shuffle.cl b/mace/ops/opencl/cl/channel_shuffle.cl index 556de82ebcc4701cd236ad2732403ca16d89ad1c..1db71fdc8b103e984342c5b58a4e73501534ae1e 100644 --- a/mace/ops/opencl/cl/channel_shuffle.cl +++ b/mace/ops/opencl/cl/channel_shuffle.cl @@ -19,10 +19,10 @@ __kernel void channel_shuffle(OUT_OF_RANGE_PARAMS #endif const int width = global_size_dim1; - const int group_blks = groups / 4; - const int groups_blks_width = group_blks * width; - const int channels_per_group_blks = channels_per_group / 4; - const int channels_per_group_blks_width = channels_per_group_blks * width; + const int group_blks = groups >> 2; + const int groups_blks_width = mul24(group_blks, width); + const int channels_per_group_blks = channels_per_group >> 2; + const int channels_per_group_blks_width = mul24(channels_per_group_blks, width); DATA_TYPE4 in_chan_data0, in_chan_data1, in_chan_data2, in_chan_data3; DATA_TYPE4 out_chan_data0, out_chan_data1, out_chan_data2, out_chan_data3; diff --git a/mace/ops/opencl/cl/conv_2d_1x1.cl b/mace/ops/opencl/cl/conv_2d_1x1.cl index ffb8a434ef033075382a3d639cf2097ce92e4407..db508ac9ab6cd90f5e6c56fbbd1fe9dd0dbae892 100644 --- a/mace/ops/opencl/cl/conv_2d_1x1.cl +++ b/mace/ops/opencl/cl/conv_2d_1x1.cl @@ -45,14 +45,16 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS w.y = w.x + in_width_stride; w.z = w.y + in_width_stride; w.w = w.z + in_width_stride; - int out_hb_idx = mul24((out_hb % height), stride); + int batch = out_hb / height; + int h_idx = out_hb - mul24(batch, height); + int out_hb_idx = mul24(h_idx, stride); w.x = select(w.x, INT_MIN, w.x >= in_width); w.y = select(w.y, INT_MIN, w.y >= in_width); w.z = select(w.z, INT_MIN, w.z >= in_width); w.w = select(w.w, INT_MIN, w.w >= in_width); - out_hb_idx = select(mad24((out_hb / height), in_height, out_hb_idx), + out_hb_idx = select(mad24(batch, in_height, out_hb_idx), -1, out_hb_idx >= in_height); diff --git a/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl b/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl index d49895fa695f95f8c3f033675cd6b9090cb52d45..15cf5b59cd13850efaab15224531b26147566321 100644 --- a/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl +++ b/mace/ops/opencl/cl/conv_2d_1x1_buffer.cl @@ -30,10 +30,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS const int out_chan_blk = (out_chan + 3) >> 2; const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk; - const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk; + const int out_chan_blk_idx = + out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk); const int batch_idx = out_hb_idx / out_height; - const int out_height_idx = out_hb_idx % out_height; + const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height); const int out_width_idx = out_width_blk_idx << 1; const int out_chan_idx = out_chan_blk_idx << 2; diff --git a/mace/ops/opencl/cl/conv_2d_buffer.cl b/mace/ops/opencl/cl/conv_2d_buffer.cl index e0c0c56412e7a0d02b5aa723ae2fd4dece421983..225e3a3b20b084c5690e632e0c5774d7b2406a2b 100644 --- a/mace/ops/opencl/cl/conv_2d_buffer.cl +++ b/mace/ops/opencl/cl/conv_2d_buffer.cl @@ -35,10 +35,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS const int out_chan_blk = (out_chan + 3) >> 2; const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk; - const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk; + const int out_chan_blk_idx = + out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk); const int batch_idx = out_hb_idx / out_height; - const int out_height_idx = out_hb_idx % out_height; + const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height); const int out_width_idx = out_width_blk_idx << 2; const int out_chan_idx = out_chan_blk_idx << 2; diff --git a/mace/ops/opencl/cl/crop.cl b/mace/ops/opencl/cl/crop.cl index 3145b2c41a323e385f2162bfcd80d9e520885d6c..3dbc974cd4462ae00f3da4df368025d1590adb16 100644 --- a/mace/ops/opencl/cl/crop.cl +++ b/mace/ops/opencl/cl/crop.cl @@ -27,7 +27,7 @@ __kernel void crop(OUT_OF_RANGE_PARAMS #endif const int b = hb_idx / out_height; - const int h = hb_idx % out_height; + const int h = hb_idx - mul24(b, out_height); const int in_chan_blk_idx = chan_blk_idx + offset_chan_blk; const int in_width_idx = width_idx + offset_w; const int in_h = h + offset_h; diff --git a/mace/ops/opencl/cl/depth_to_space.cl b/mace/ops/opencl/cl/depth_to_space.cl index 8ac80a8fbfa7d88911cdbb8410114fb2e67e000e..c1dc806a231e8d632b8a60ef885dba96d24fd9dc 100644 --- a/mace/ops/opencl/cl/depth_to_space.cl +++ b/mace/ops/opencl/cl/depth_to_space.cl @@ -24,10 +24,10 @@ __kernel void depth_to_space(OUT_OF_RANGE_PARAMS const int out_pos = mad24(out_d, output_width, out_w); const int in_hb = out_hb / block_size; - const int offset_h = out_hb % block_size; + const int offset_h = out_hb - mul24(in_hb, block_size); const int in_w = out_w / block_size; - const int offset_w = out_w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks; + const int offset_w = out_w - mul24(in_w, block_size); + const int offset_d = mul24(mad24(offset_h, block_size, offset_w), output_depth_blocks); const int in_d = out_d + offset_d; if (in_hb >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) { diff --git a/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl b/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl index 31f2460b2aa8a20718c8d4440d44749e8b787b3f..efbd75c7852f07d05888e95b8cc31fdda2195d5d 100644 --- a/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl +++ b/mace/ops/opencl/cl/depthwise_conv2d_buffer.cl @@ -35,10 +35,10 @@ __kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS const int out_chan_blk = (out_chan + 3) >> 2; const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk; - const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk; + const int out_chan_blk_idx = out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk); const int batch_idx = out_hb_idx / out_height; - const int out_height_idx = out_hb_idx % out_height; + const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height); const int out_width_idx = out_width_blk_idx << 2; const int out_chan_idx = out_chan_blk_idx << 2; const int in_chan_idx = out_chan_idx; diff --git a/mace/ops/opencl/cl/fully_connected.cl b/mace/ops/opencl/cl/fully_connected.cl index 9a76dfed2dabe1d4f099741064bfc618cf154120..14e3ee64d39fac750ed00d27eed605ace0cef7ba 100644 --- a/mace/ops/opencl/cl/fully_connected.cl +++ b/mace/ops/opencl/cl/fully_connected.cl @@ -35,7 +35,7 @@ __kernel void fully_connected(OUT_OF_RANGE_PARAMS for (short h_idx = 0; h_idx < input_height; ++h_idx) { for (short w_idx = 0; w_idx < input_width; ++w_idx) { input_coord.x = w_idx; - weight_x = (h_idx * input_width + w_idx) * input_channel; + weight_x = mul24(mad24(h_idx, input_width, w_idx), input_channel); #pragma unroll for (short chan_idx = 0; chan_idx < input_chan_blk; ++chan_idx) { input_value = READ_IMAGET(input, SAMPLER, input_coord); @@ -84,7 +84,7 @@ __kernel void fully_connected_width(OUT_OF_RANGE_PARAMS const int batch_out_blk_idx = get_global_id(2); const int batch_idx = batch_out_blk_idx / out_blks; - const int out_blk_idx = batch_out_blk_idx % out_blks; + const int out_blk_idx = batch_out_blk_idx - mul24(batch_idx, out_blks); const short in_outer_size = mul24(input_width, in_chan_blks); const short weight_y = mad24(out_blk_idx, 4, inter_out_idx); diff --git a/mace/ops/opencl/cl/lstmcell.cl b/mace/ops/opencl/cl/lstmcell.cl index 909c63d0453d90e10baa71cc327b6ac2a8a0596e..c439ba88a3dcf4b5a250e730093652904a3a8f1b 100644 --- a/mace/ops/opencl/cl/lstmcell.cl +++ b/mace/ops/opencl/cl/lstmcell.cl @@ -20,117 +20,118 @@ __kernel void lstmcell(OUT_OF_RANGE_PARAMS if (w_blk_idx >= global_size_dim0 || h_idx >= global_size_dim1) return; #endif - // fc_res0 -> i - // fc_res1 -> j - // fc_res2 -> f - // fc_res3 -> o - DATA_TYPE4 fc_res0 = 0.0, fc_res1 = 0.0, fc_res2 = 0.0, fc_res3 = 0.0; DATA_TYPE4 in, pre_h; DATA_TYPE4 w0, w1, w2, w3; int k_offset; // concat matmul + const int pos_x0 = w_blk_idx; + const int pos_x1 = pos_x0 + global_size_dim0; + const int pos_x2 = pos_x1 + global_size_dim0; + const int pos_x3 = pos_x2 + global_size_dim0; + + // bias + // fc_res0 -> i + // fc_res1 -> j + // fc_res2 -> f + // fc_res3 -> o + DATA_TYPE4 fc_res0, fc_res1, fc_res2, fc_res3; + fc_res0 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x0, 0)); + fc_res1 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x1, 0)); + fc_res2 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x2, 0)); + fc_res3 = READ_IMAGET(bias, SAMPLER, (int2)(pos_x3, 0)); + + for (short i = 0; i < in_w_blk; ++i) { in = READ_IMAGET(input, SAMPLER, (int2)(i, h_idx)); int k = i << 2; - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k)); - fc_res0 += in.x * w0; - fc_res1 += in.x * w1; - fc_res2 += in.x * w2; - fc_res3 += in.x * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k)); + fc_res0 = mad(in.x, w0, fc_res0); + fc_res1 = mad(in.x, w1, fc_res1); + fc_res2 = mad(in.x, w2, fc_res2); + fc_res3 = mad(in.x, w3, fc_res3); k += 1; k_offset = select(-1, k, k < width); - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k_offset)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k_offset)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k_offset)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k_offset)); - fc_res0 += in.y * w0; - fc_res1 += in.y * w1; - fc_res2 += in.y * w2; - fc_res3 += in.y * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k_offset)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k_offset)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k_offset)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k_offset)); + fc_res0 = mad(in.y, w0, fc_res0); + fc_res1 = mad(in.y, w1, fc_res1); + fc_res2 = mad(in.y, w2, fc_res2); + fc_res3 = mad(in.y, w3, fc_res3); k += 1; k_offset = select(-1, k, k < width); - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k_offset)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k_offset)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k_offset)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k_offset)); - fc_res0 += in.z * w0; - fc_res1 += in.z * w1; - fc_res2 += in.z * w2; - fc_res3 += in.z * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k_offset)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k_offset)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k_offset)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k_offset)); + fc_res0 = mad(in.z, w0, fc_res0); + fc_res1 = mad(in.z, w1, fc_res1); + fc_res2 = mad(in.z, w2, fc_res2); + fc_res3 = mad(in.z, w3, fc_res3); k += 1; k_offset = select(-1, k, k < width); - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k_offset)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k_offset)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k_offset)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k_offset)); - fc_res0 += in.w * w0; - fc_res1 += in.w * w1; - fc_res2 += in.w * w2; - fc_res3 += in.w * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k_offset)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k_offset)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k_offset)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k_offset)); + fc_res0 = mad(in.w, w0, fc_res0); + fc_res1 = mad(in.w, w1, fc_res1); + fc_res2 = mad(in.w, w2, fc_res2); + fc_res3 = mad(in.w, w3, fc_res3); } for (short i = 0; i < global_size_dim0; ++i) { pre_h = READ_IMAGET(pre_output, SAMPLER, (int2)(i, h_idx)); int k = (i << 2) + width; - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k)); - fc_res0 += pre_h.x * w0; - fc_res1 += pre_h.x * w1; - fc_res2 += pre_h.x * w2; - fc_res3 += pre_h.x * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k)); + fc_res0 = mad(pre_h.x, w0, fc_res0); + fc_res1 = mad(pre_h.x, w1, fc_res1); + fc_res2 = mad(pre_h.x, w2, fc_res2); + fc_res3 = mad(pre_h.x, w3, fc_res3); k += 1; - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k)); - fc_res0 += pre_h.y * w0; - fc_res1 += pre_h.y * w1; - fc_res2 += pre_h.y * w2; - fc_res3 += pre_h.y * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k)); + fc_res0 = mad(pre_h.y, w0, fc_res0); + fc_res1 = mad(pre_h.y, w1, fc_res1); + fc_res2 = mad(pre_h.y, w2, fc_res2); + fc_res3 = mad(pre_h.y, w3, fc_res3); k += 1; - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k)); - fc_res0 += pre_h.z * w0; - fc_res1 += pre_h.z * w1; - fc_res2 += pre_h.z * w2; - fc_res3 += pre_h.z * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k)); + fc_res0 = mad(pre_h.z, w0, fc_res0); + fc_res1 = mad(pre_h.z, w1, fc_res1); + fc_res2 = mad(pre_h.z, w2, fc_res2); + fc_res3 = mad(pre_h.z, w3, fc_res3); k += 1; - w0 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx, k)); - w1 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0, k)); - w2 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, k)); - w3 = READ_IMAGET(weight, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, k)); - fc_res0 += pre_h.w * w0; - fc_res1 += pre_h.w * w1; - fc_res2 += pre_h.w * w2; - fc_res3 += pre_h.w * w3; + w0 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x0, k)); + w1 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x1, k)); + w2 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x2, k)); + w3 = READ_IMAGET(weight, SAMPLER, (int2)(pos_x3, k)); + fc_res0 = mad(pre_h.w, w0, fc_res0); + fc_res1 = mad(pre_h.w, w1, fc_res1); + fc_res2 = mad(pre_h.w, w2, fc_res2); + fc_res3 = mad(pre_h.w, w3, fc_res3); } - // bias - DATA_TYPE4 b0, b1, b2, b3; - b0 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx, 0)); - b1 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx + global_size_dim0, 0)); - b2 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 2, 0)); - b3 = READ_IMAGET(bias, SAMPLER, (int2)(w_blk_idx + global_size_dim0 * 3, 0)); - fc_res0 += b0; - fc_res1 += b1; - fc_res2 += b2; - fc_res3 += b3; - // gate DATA_TYPE4 pre_c, c, h; pre_c = READ_IMAGET(pre_cell, SAMPLER, (int2)(w_blk_idx, h_idx)); diff --git a/mace/ops/opencl/cl/matmul.cl b/mace/ops/opencl/cl/matmul.cl index c272e040e9802fa52b3dd9c7cf1e4e313013e127..462db3a08168018da5e97bc255346f5a380afa6c 100644 --- a/mace/ops/opencl/cl/matmul.cl +++ b/mace/ops/opencl/cl/matmul.cl @@ -19,7 +19,7 @@ __kernel void matmul(OUT_OF_RANGE_PARAMS #endif const int batch = hb / height_blocks; - const int ty = (hb % height_blocks); + const int ty = hb - mul24(batch, height_blocks); const int gy = mad24(batch, height_blocks, ty); const int bm = mad24(batch, M, ty << 2); const int bk = mul24(batch, k_blocks); diff --git a/mace/ops/opencl/cl/pad.cl b/mace/ops/opencl/cl/pad.cl index ad323b76c32c15469d70002b45c579ba831ebc9c..d482cf1d18acbb6603d32b25272b17e3c3848183 100644 --- a/mace/ops/opencl/cl/pad.cl +++ b/mace/ops/opencl/cl/pad.cl @@ -14,7 +14,7 @@ __kernel void pad(OUT_OF_RANGE_PARAMS const int width_idx = get_global_id(1); const int hb_idx = get_global_id(2); const int batch_idx = hb_idx / output_height; - const int height_idx = hb_idx % output_height; + const int height_idx = hb_idx - mul24(batch_idx, output_height); const int input_padded_height = input_height + height_padding; const int input_padded_width = input_width + width_padding; diff --git a/mace/ops/opencl/cl/pooling.cl b/mace/ops/opencl/cl/pooling.cl index 28987d3ca28c45e7c79c137e0c8a2973f6e8c4e4..0752cbc303acd41283ad0d5b76212745092f0058 100644 --- a/mace/ops/opencl/cl/pooling.cl +++ b/mace/ops/opencl/cl/pooling.cl @@ -42,9 +42,11 @@ __kernel void pooling(OUT_OF_RANGE_PARAMS #endif const int out_width = global_size_dim1; - const int batch_idx = mul24((out_hb_idx / out_height), in_height); - const int in_height_start = mul24((out_hb_idx % out_height), stride_h) - pad_top; - const int in_width_start = mul24(out_width_idx, stride_w) - pad_left; + const int n_b = out_hb_idx / out_height; + const int mod_b = out_hb_idx - mul24(n_b, out_height); + const int batch_idx = mul24(n_b, in_height); + const int in_height_start = mad24(mod_b, stride_h, -pad_top); + const int in_width_start = mad24(out_width_idx, stride_w, -pad_left); const int in_channel_offset = mul24(out_chan_idx, in_width); diff --git a/mace/ops/opencl/cl/pooling_buffer.cl b/mace/ops/opencl/cl/pooling_buffer.cl index c4ecff9e19dfc737b57df69e24abc47add54bd6c..fa2a39b334839df72339736b9e346da05d24f026 100644 --- a/mace/ops/opencl/cl/pooling_buffer.cl +++ b/mace/ops/opencl/cl/pooling_buffer.cl @@ -47,7 +47,7 @@ __kernel void pooling(BUFFER_OUT_OF_RANGE_PARAMS const int in_wc_size = mul24(in_width, in_chan); const int batch_idx = out_hb_idx / out_height; - const int out_height_idx = out_hb_idx % out_height; + const int out_height_idx = out_hb_idx - mul24(batch_idx, out_height); const int chan_idx = out_chan_blk_idx << 2; const int in_height_start = mul24(out_height_idx, stride_h) - pad_top; const int in_width_start = mul24(out_width_idx, stride_w) - pad_left; diff --git a/mace/ops/opencl/cl/reduce_mean.cl b/mace/ops/opencl/cl/reduce_mean.cl index 93a318b3c0da82ae6dae64d05f6cfe123f6af1fd..c2810f4876e57f3fb82836c231ed4277ff186055 100644 --- a/mace/ops/opencl/cl/reduce_mean.cl +++ b/mace/ops/opencl/cl/reduce_mean.cl @@ -11,9 +11,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS __private const int in_height, __private const int in_width, __private const float image_size_reciprocal, - __private const float in_width_reciprocal, __private const int channel_blocks, - __private const float channel_blocks_reciprocal, __write_only image2d_t output) { const int i = get_local_id(0); const int j = get_local_id(1); @@ -26,7 +24,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS const int dim0_size = get_local_size(0); float4 tmp = (float4){0, 0, 0, 0}; const int index = mad24(j, dim0_size, i); - const int b = floor(k * channel_blocks_reciprocal); + const int b = k / channel_blocks; const int ch = mad24(b, -channel_blocks, k); DATA_TYPE4 in; @@ -40,7 +38,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS #pragma unroll for (int l = 0; l < valid_part_len; ++l) { int offset = base_offset + l; - int h_id = floor(offset * in_width_reciprocal); + int h_id = offset / in_width; int w_id = mad24(h_id, -in_width, offset); int pos_x = mad24(ch, in_width, w_id); int pos_y = mad24(b, in_height, h_id); diff --git a/mace/ops/opencl/cl/resize_bicubic.cl b/mace/ops/opencl/cl/resize_bicubic.cl index a2863a47f21e59bc91eed3eb91a089088002aa24..396c0f1019d79d1c500cb262be58bb0694046a07 100644 --- a/mace/ops/opencl/cl/resize_bicubic.cl +++ b/mace/ops/opencl/cl/resize_bicubic.cl @@ -36,7 +36,7 @@ __kernel void resize_bicubic_nocache(OUT_OF_RANGE_PARAMS #endif const int b = hb / out_height; - const int h = hb % out_height; + const int h = hb - mul24(b, out_height); const float h_in = h * height_scale; const float w_in = w * width_scale; diff --git a/mace/ops/opencl/cl/resize_bilinear.cl b/mace/ops/opencl/cl/resize_bilinear.cl index 4aa3af9f3b66bcdc68c9b3c555f7ac2201f55147..1850efd843b0f4f81388727faa70e1801316bd6d 100644 --- a/mace/ops/opencl/cl/resize_bilinear.cl +++ b/mace/ops/opencl/cl/resize_bilinear.cl @@ -24,7 +24,7 @@ __kernel void resize_bilinear_nocache(OUT_OF_RANGE_PARAMS const int out_width = global_size_dim1; const int b = hb / out_height; - const int h = hb % out_height; + const int h = hb - mul24(b, out_height); const float h_in = h * height_scale; const float w_in = w * width_scale; diff --git a/mace/ops/opencl/cl/softmax_buffer.cl b/mace/ops/opencl/cl/softmax_buffer.cl index 8ec5d84c084d2c59811299aba70ad9bf9f4a05ef..2a96a237d91c9d05fe7516527a331c681f62a3be 100644 --- a/mace/ops/opencl/cl/softmax_buffer.cl +++ b/mace/ops/opencl/cl/softmax_buffer.cl @@ -20,7 +20,7 @@ __kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS const int chan_blks = global_size_dim0 - 1; const int width = global_size_dim1; const int batch_idx = hb_idx / height; - const int height_idx = hb_idx % height; + const int height_idx = hb_idx - mul24(batch_idx, height); const int chan_idx = chan_blk_idx << 2; const int offset_base = mul24(mad24(mad24(batch_idx, height, height_idx), diff --git a/mace/ops/opencl/cl/space_to_batch.cl b/mace/ops/opencl/cl/space_to_batch.cl index dbdcea205be29ee904b17e5b3c1fe66de10f7584..ce7e3d3507f1abb531040e0659f2b24adb356602 100644 --- a/mace/ops/opencl/cl/space_to_batch.cl +++ b/mace/ops/opencl/cl/space_to_batch.cl @@ -25,14 +25,19 @@ __kernel void space_to_batch(OUT_OF_RANGE_PARAMS #endif const int batch_b_idx = batch_hb_idx / batch_height; - const int batch_h_idx = batch_hb_idx % batch_height; + const int batch_h_idx = batch_hb_idx - mul24(batch_b_idx, batch_height); const int block_size = mul24(block_height, block_width); - const int space_b_idx = batch_b_idx % batch_size; const int remaining_batch_idx = batch_b_idx / batch_size; - const int space_h_idx = (remaining_batch_idx / block_width) + + const int space_b_idx = + batch_b_idx - mul24(remaining_batch_idx, batch_size); + + const int n_remain_blk_w = remaining_batch_idx / block_width; + const int mod_remain_blk_w = + remaining_batch_idx - mul24(n_remain_blk_w, block_width); + const int space_h_idx = n_remain_blk_w + mul24(batch_h_idx, block_height) - padding_height; - const int space_w_idx = (remaining_batch_idx % block_width) + + const int space_w_idx = mod_remain_blk_w + mul24(batch_w_idx, block_width) - padding_width; const int space_coord_x = select(mul24(chan_idx, space_width) + space_w_idx, diff --git a/mace/ops/opencl/cl/space_to_depth.cl b/mace/ops/opencl/cl/space_to_depth.cl index 94dd38839406b269e02d3f6e69f04d20b1a5a72b..6dc821d7b28b3f6ae6c2d60206ebc69a6cf8ca79 100644 --- a/mace/ops/opencl/cl/space_to_depth.cl +++ b/mace/ops/opencl/cl/space_to_depth.cl @@ -24,10 +24,10 @@ __kernel void space_to_depth(OUT_OF_RANGE_PARAMS const int in_pos = mad24(d, input_width, w); const int out_hb = hb / block_size; - const int offset_h = hb % block_size; + const int offset_h = hb - mul24(out_hb, block_size); const int out_w = w / block_size; - const int offset_w = w % block_size; - const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks; + const int offset_w = w - mul24(out_w, block_size); + const int offset_d = mul24(input_depth_blocks, mad24(offset_h, block_size, offset_w)); const int out_d = d + offset_d; if (out_d >= output_depth_blocks || out_hb >= output_hb || out_w >= output_width) { diff --git a/mace/ops/opencl/cl/sqrdiff_mean.cl b/mace/ops/opencl/cl/sqrdiff_mean.cl index 2a297bea18d0d01fd7ddedcf719fc8c90925e7e9..4598f89ce0e3ea3bcbfbb84d5241e8608545b9e9 100644 --- a/mace/ops/opencl/cl/sqrdiff_mean.cl +++ b/mace/ops/opencl/cl/sqrdiff_mean.cl @@ -12,9 +12,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS __private const int in_height, __private const int in_width, __private const float image_size_reciprocal, - __private const float in_width_reciprocal, __private const int channel_blocks, - __private const float channel_blocks_reciprocal, __write_only image2d_t output) { const int i = get_local_id(0); const int j = get_local_id(1); @@ -27,7 +25,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS const int dim0_size = get_local_size(0); float4 tmp = (float4){0, 0, 0, 0}; const int index = mad24(j, dim0_size, i); - const int b = floor(k * channel_blocks_reciprocal); + const int b = k / channel_blocks; const int ch = mad24(b, -channel_blocks, k); DATA_TYPE4 in; @@ -43,7 +41,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS #pragma unroll for (int l = 0; l < valid_part_len; ++l) { int offset = base_offset + l; - int h_id = floor(offset * in_width_reciprocal); + int h_id = offset / in_width; int w_id = mad24(h_id, -in_width, offset); int pos_x = mad24(ch, in_width, w_id); int pos_y = mad24(b, in_height, h_id); diff --git a/mace/ops/opencl/image/reduce_mean.h b/mace/ops/opencl/image/reduce_mean.h index 78c2e7e51dd75d889b0ad714abc8259b90b9ff10..953742cbbec2e24f257f28d4684a80729cadf9ac 100644 --- a/mace/ops/opencl/image/reduce_mean.h +++ b/mace/ops/opencl/image/reduce_mean.h @@ -111,9 +111,7 @@ MaceStatus ReduceMeanKernel::Compute( const int group_size = lws[0] * lws[1] * lws[2]; const int partial_len = (image_size + group_size - 1) / group_size; const int remain_index = image_size % group_size; - const float in_width_reciprocal = 1.f / in_width; const float img_size_reciprocal = 1.f / (in_width * in_height); - const float channel_blk_reciprocal = 1.f / channel_blocks; MACE_OUT_OF_RANGE_INIT(kernel_); if (!IsVecEqual(input_shape_, input->shape())) { @@ -130,9 +128,7 @@ MaceStatus ReduceMeanKernel::Compute( kernel_.setArg(idx++, static_cast(in_height)); kernel_.setArg(idx++, static_cast(in_width)); kernel_.setArg(idx++, img_size_reciprocal); - kernel_.setArg(idx++, in_width_reciprocal); kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, channel_blk_reciprocal); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input->shape(); diff --git a/mace/ops/opencl/image/sqrdiff_mean.h b/mace/ops/opencl/image/sqrdiff_mean.h index 791566f8e2458f47ccedb591d799a91ca627ac4e..d356b89859ee9a9c24541a1270f919f188be62eb 100644 --- a/mace/ops/opencl/image/sqrdiff_mean.h +++ b/mace/ops/opencl/image/sqrdiff_mean.h @@ -107,9 +107,7 @@ MaceStatus SqrDiffMeanKernel::Compute( const int group_size = lws[0] * lws[1] * lws[2]; const int partial_len = (image_size + group_size - 1) / group_size; const int remain_index = image_size % group_size; - const float in_width_reciprocal = 1.f / in_width; const float img_size_reciprocal = 1.f / (in_width * in_height); - const float channel_blk_reciprocal = 1.f / channel_blocks; MACE_OUT_OF_RANGE_INIT(kernel_); if (!IsVecEqual(input_shape_, input0->shape())) { @@ -127,9 +125,7 @@ MaceStatus SqrDiffMeanKernel::Compute( kernel_.setArg(idx++, static_cast(in_height)); kernel_.setArg(idx++, static_cast(in_width)); kernel_.setArg(idx++, img_size_reciprocal); - kernel_.setArg(idx++, in_width_reciprocal); kernel_.setArg(idx++, static_cast(channel_blocks)); - kernel_.setArg(idx++, channel_blk_reciprocal); kernel_.setArg(idx++, *(output->opencl_image())); input_shape_ = input0->shape();