Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
1eb50c6c
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
1eb50c6c
编写于
11月 16, 2018
作者:
刘
刘琦
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'fix-round-bug-in-cl' into 'master'
fix round bug in cl kernel See merge request !871
上级
7dee1ad0
4ab4c17e
变更
25
隐藏空白更改
内联
并排
Showing
25 changed file
with
237 addition
and
215 deletion
+237
-215
mace/ops/opencl/cl/batch_to_space.cl
mace/ops/opencl/cl/batch_to_space.cl
+11
-9
mace/ops/opencl/cl/buffer_to_image.cl
mace/ops/opencl/cl/buffer_to_image.cl
+91
-73
mace/ops/opencl/cl/buffer_transform.cl
mace/ops/opencl/cl/buffer_transform.cl
+5
-3
mace/ops/opencl/cl/channel_shuffle.cl
mace/ops/opencl/cl/channel_shuffle.cl
+4
-4
mace/ops/opencl/cl/conv_2d_1x1.cl
mace/ops/opencl/cl/conv_2d_1x1.cl
+4
-2
mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
+3
-2
mace/ops/opencl/cl/conv_2d_buffer.cl
mace/ops/opencl/cl/conv_2d_buffer.cl
+3
-2
mace/ops/opencl/cl/crop.cl
mace/ops/opencl/cl/crop.cl
+1
-1
mace/ops/opencl/cl/depth_to_space.cl
mace/ops/opencl/cl/depth_to_space.cl
+3
-3
mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
+2
-2
mace/ops/opencl/cl/fully_connected.cl
mace/ops/opencl/cl/fully_connected.cl
+2
-2
mace/ops/opencl/cl/lstmcell.cl
mace/ops/opencl/cl/lstmcell.cl
+81
-80
mace/ops/opencl/cl/matmul.cl
mace/ops/opencl/cl/matmul.cl
+1
-1
mace/ops/opencl/cl/pad.cl
mace/ops/opencl/cl/pad.cl
+1
-1
mace/ops/opencl/cl/pooling.cl
mace/ops/opencl/cl/pooling.cl
+5
-3
mace/ops/opencl/cl/pooling_buffer.cl
mace/ops/opencl/cl/pooling_buffer.cl
+1
-1
mace/ops/opencl/cl/reduce_mean.cl
mace/ops/opencl/cl/reduce_mean.cl
+2
-4
mace/ops/opencl/cl/resize_bicubic.cl
mace/ops/opencl/cl/resize_bicubic.cl
+1
-1
mace/ops/opencl/cl/resize_bilinear.cl
mace/ops/opencl/cl/resize_bilinear.cl
+1
-1
mace/ops/opencl/cl/softmax_buffer.cl
mace/ops/opencl/cl/softmax_buffer.cl
+1
-1
mace/ops/opencl/cl/space_to_batch.cl
mace/ops/opencl/cl/space_to_batch.cl
+9
-4
mace/ops/opencl/cl/space_to_depth.cl
mace/ops/opencl/cl/space_to_depth.cl
+3
-3
mace/ops/opencl/cl/sqrdiff_mean.cl
mace/ops/opencl/cl/sqrdiff_mean.cl
+2
-4
mace/ops/opencl/image/reduce_mean.h
mace/ops/opencl/image/reduce_mean.h
+0
-4
mace/ops/opencl/image/sqrdiff_mean.h
mace/ops/opencl/image/sqrdiff_mean.h
+0
-4
未找到文件。
mace/ops/opencl/cl/batch_to_space.cl
浏览文件 @
1eb50c6c
...
...
@@ -25,23 +25,25 @@ __kernel void batch_to_space(OUT_OF_RANGE_PARAMS
#
endif
const
int
batch_b_idx
=
batch_hb_idx
/
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
%
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
-
mul24
(
batch_b_idx,
batch_height
)
;
const
int
block_size
=
mul24
(
block_height,
block_width
)
;
const
int
space_b_idx
=
batch_b_idx
%
batch_size
;
const
int
remaining_batch_idx
=
batch_b_idx
/
batch_size
;
const
int
space_h_idx
=
(
remaining_batch_idx
/
block_width
)
+
mul24
(
batch_h_idx,
block_height
)
-
padding_height
;
const
int
space_w_idx
=
(
remaining_batch_idx
%
block_width
)
+
mul24
(
batch_w_idx,
block_width
)
-
padding_width
;
const
int
space_b_idx
=
batch_b_idx
-
mul24
(
remaining_batch_idx,
batch_size
)
;
const
int
n_h
=
remaining_batch_idx
/
block_width
;
const
int
mod_h
=
remaining_batch_idx
-
mul24
(
n_h,
block_width
)
;
const
int
space_h_idx
=
mad24
(
batch_h_idx,
block_height,
n_h
-
padding_height
)
;
const
int
space_w_idx
=
mad24
(
batch_w_idx,
block_width,
mod_h
-
padding_width
)
;
if
(
0
<=
space_w_idx
&&
space_w_idx
<
space_width
&&
0
<=
space_h_idx
&&
space_h_idx
<
space_height
)
{
int2
batch_coord
=
(
int2
)(
m
ul24
(
chan_idx,
batch_width
)
+
batch_w_idx
,
batch_hb_idx
)
;
int2
batch_coord
=
(
int2
)(
m
ad24
(
chan_idx,
batch_width,
batch_w_idx
)
,
batch_hb_idx
)
;
DATA_TYPE4
value
=
READ_IMAGET
(
batch_data,
SAMPLER,
batch_coord
)
;
int2
space_coord
=
(
int2
)(
m
ul24
(
chan_idx,
space_width
)
+
space_w_idx
,
space_b_idx
*
space_height
+
space_h_idx
)
;
int2
space_coord
=
(
int2
)(
m
ad24
(
chan_idx,
space_width,
space_w_idx
)
,
mad24
(
space_b_idx,
space_height,
space_h_idx
)
)
;
WRITE_IMAGET
(
space_data,
space_coord,
value
)
;
}
...
...
mace/ops/opencl/cl/buffer_to_image.cl
浏览文件 @
1eb50c6c
...
...
@@ -19,11 +19,12 @@ __kernel void filter_buffer_to_image(OUT_OF_RANGE_PARAMS
#endif
const int in_channel_idx = w;
const int hw_size = filter_w * filter_h;
const int out_channel_idx = h / hw_size * 4;
const int hw_idx = h % hw_size;
const int hw_size = mul24(filter_w, filter_h);
int out_channel_idx = h / hw_size;
const int hw_idx = h - mul24(out_channel_idx, hw_size);
out_channel_idx = out_channel_idx << 2;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx
% filter_w
;
const int w_idx = hw_idx
- mul24(h_idx, filter_w)
;
const int offset = input_offset +
mad24(out_channel_idx, inner_size,
mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx));
...
...
@@ -70,11 +71,12 @@ __kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
#endif
const int in_channel_idx = w;
const int hw_size = filter_w * filter_h;
const int out_channel_idx = h / hw_size * 4;
const int hw_idx = h % hw_size;
const int hw_size = mul24(filter_w, filter_h);
int out_channel_idx = h / hw_size;
const int hw_idx = h - mul24(out_channel_idx, hw_size);
out_channel_idx = out_channel_idx << 2;
const int h_idx = hw_idx / filter_w;
const int w_idx = hw_idx
% filter_w
;
const int w_idx = hw_idx
- mul24(h_idx, filter_w)
;
const int offset =
mad24(out_channel_idx, inner_size,
mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx));
...
...
@@ -86,16 +88,16 @@ __kernel void filter_image_to_buffer(OUT_OF_RANGE_PARAMS
if (size < 4) {
switch (size) {
case 3:
output[offset +
2 * inner_size
] = values.z;
output[offset +
(inner_size << 1)
] = values.z;
case 2:
output[offset +
1 *
inner_size] = values.y;
output[offset + inner_size] = values.y;
case 1:
output[offset] = values.x;
}
} else {
output[offset + 3 * inner_size] = values.w;
output[offset + 2 * inner_size] = values.z;
output[offset +
1 *
inner_size] = values.y;
output[offset + inner_size] = values.y;
output[offset] = values.x;
}
}
...
...
@@ -124,7 +126,7 @@ __kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
if (multiplier == 1) {
const int in_channel_idx = h << 2;
const int h_idx = w / filter_w;
const int w_idx = w
% filter_w
;
const int w_idx = w
- mul24(h_idx, filter_w)
;
const int offset = input_offset
+ mad24(mad24(in_channel_idx, filter_h, h_idx), filter_w, w_idx);
...
...
@@ -137,14 +139,14 @@ __kernel void dw_filter_buffer_to_image(OUT_OF_RANGE_PARAMS
case 3:
values.z = *(input + offset + 2 * hw_size);
case 2:
values.y = *(input + offset +
1 *
hw_size);
values.y = *(input + offset + hw_size);
case 1:
values.x = *(input + offset);
}
} else {
values.x = *(input + offset);
values.y = *(input + offset +
1 *
hw_size);
values.z = *(input + offset +
2 * hw_size
);
values.y = *(input + offset + hw_size);
values.z = *(input + offset +
(hw_size << 1)
);
values.w = *(input + offset + 3 * hw_size);
}
}
...
...
@@ -172,11 +174,14 @@ __kernel void in_out_buffer_to_image(OUT_OF_RANGE_PARAMS
#endif
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const int height_idx = h - mul24(batch_idx, height);
int channel_idx = w / width;
const int width_idx = w - mul24(channel_idx, width);
channel_idx = channel_idx << 2;
const int offset =
mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
channels,
input_offset + channel_idx);
const int size = channels - channel_idx;
DATA_TYPE4 values = 0;
...
...
@@ -213,11 +218,13 @@ __kernel void in_out_image_to_buffer(OUT_OF_RANGE_PARAMS
#endif
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
const int channel_idx = w / width * 4;
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const int height_idx = h - mul24(batch_idx, height);
int channel_idx = w / width;
const int width_idx = w - mul24(channel_idx, width);
channel_idx = channel_idx << 2;
const int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
channels,
channel_idx);
int2 coord = (int2)(w, h);
DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
...
...
@@ -251,8 +258,8 @@ __kernel void arg_buffer_to_image(OUT_OF_RANGE_PARAMS
}
#endif
const int offset = input_offset +
w * 4
;
const int size = count -
w * 4
;
const int offset = input_offset +
(w << 2)
;
const int size = count -
(w << 2)
;
DATA_TYPE4 values = 0;
...
...
@@ -286,7 +293,7 @@ __kernel void arg_image_to_buffer(OUT_OF_RANGE_PARAMS
}
#endif
const int offset =
w * 4
;
const int offset =
(w << 2)
;
int2 coord = (int2)(w, h);
DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
...
...
@@ -323,14 +330,15 @@ __kernel void in_out_height_buffer_to_image(OUT_OF_RANGE_PARAMS
}
#endif
const int wc =
width * channels
;
const int height_blks = (height + 3)
/ 4
;
const int wc =
mul24(width, channels)
;
const int height_blks = (height + 3)
>> 2
;
const int batch_idx = h / height_blks;
const int height_idx = (h % height_blks) << 2;
const int width_idx = w % width;
const int height_idx = (h - mul24(batch_idx, height_blks)) << 2;
const int channel_idx = w / width;
int offset = input_offset + ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const int width_idx = w - mul24(channel_idx, width);
int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
channels,
input_offset+ channel_idx);
int size = height - height_idx;
size = size >= 4 ? 0 : size;
...
...
@@ -365,25 +373,26 @@ __kernel void in_out_height_image_to_buffer(OUT_OF_RANGE_PARAMS
}
#endif
const int height_blks = (height + 3)
/ 4
;
const int height_blks = (height + 3)
>> 2
;
const int batch_idx = h / height_blks;
const int height_idx = (h % height_blks) << 2;
const int width_idx = w % width;
const int height_idx = (h - mul24(batch_idx, height_blks)) << 2;
const int channel_idx = w / width;
int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const int width_idx = w - mul24(channel_idx, width);
int offset = mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
channels,
channel_idx);
int2 coord = (int2)(w, h);
DATA_TYPE4 values = READ_IMAGET(input, SAMPLER, coord);
output[offset] = values.x;
if (height_idx + 1 >= height) return;
offset
+= width * channels
;
offset
= mad24(width, channels, offset)
;
output[offset] = values.y;
if (height_idx + 2 >= height) return;
offset
+= width * channels
;
offset
= mad24(width, channels, offset)
;
output[offset] = values.z;
if (height_idx + 3 >= height) return;
offset
+= width * channels
;
offset
= mad24(width, channels, offset)
;
output[offset] = values.w;
}
...
...
@@ -404,14 +413,15 @@ __kernel void in_out_width_buffer_to_image(OUT_OF_RANGE_PARAMS
}
#endif
const int width_blks = (width + 3)
/ 4
;
const int width_blks = (width + 3)
>> 2
;
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = (w % width_blks) << 2;
const int height_idx = h - mul24(batch_idx, height);
const int channel_idx = w / width_blks;
const int offset = input_offset
+ ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const int width_idx = (w - mul24(channel_idx, width_blks)) << 2;
const int offset =
mad24(mad24(mad24(batch_idx, height, height_idx), width, width_idx),
channels,
channel_idx + input_offset);
int size = width - width_idx;
size = size >= 4 ? 0 : size;
...
...
@@ -450,13 +460,13 @@ __kernel void weight_height_buffer_to_image(OUT_OF_RANGE_PARAMS
const int inner_size = global_size_dim0;
const int out_chan_idx = h << 2;
const int in_chan_idx = w % in_channels;
const int hw_idx = w / in_channels;
const int in_chan_idx = w - mul24(hw_idx, in_channels);
const int height_idx = hw_idx / width;
const int width_idx = hw_idx
% width
;
int offset =
input_offset +
const int width_idx = hw_idx
- mul24(height_idx, width)
;
int offset =
mad24(out_chan_idx, inner_size,
mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
mad24(mad24(in_chan_idx, height, height_idx), width, width_idx
+ input_offset
));
int size = out_channels - out_chan_idx;
size = size >= 4 ? 0 : size;
...
...
@@ -494,10 +504,10 @@ __kernel void weight_height_image_to_buffer(OUT_OF_RANGE_PARAMS
const int inner_size = global_size_dim0;
const int out_chan_idx = h << 2;
const int in_chan_idx = w % in_channels;
const int hw_idx = w / in_channels;
const int in_chan_idx = w - mul24(hw_idx, in_channels);
const int height_idx = hw_idx / width;
const int width_idx = hw_idx
% width
;
const int width_idx = hw_idx
- mul24(height_idx, width)
;
int offset =
mad24(out_chan_idx, inner_size,
mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
...
...
@@ -535,17 +545,17 @@ __kernel void weight_width_buffer_to_image(OUT_OF_RANGE_PARAMS
#endif
const int out_channels = global_size_dim1;
const int in_chan_blks = (in_channels + 3) >> 2;
const int hw_size =
height * width
;
const int inner_size =
in_channels * hw_size
;
const int hw_size =
mul24(height, width)
;
const int inner_size =
mul24(in_channels, hw_size)
;
const int out_chan_idx = h;
const int in_chan_idx = (w % in_chan_blks) << 2;
const int hw_idx = w / in_chan_blks;
const int in_chan_idx = (w - mul24(hw_idx, in_chan_blks)) << 2;
const int height_idx = hw_idx / width;
const int width_idx = hw_idx
% width
;
int offset =
input_offset +
const int width_idx = hw_idx
- mul24(height_idx, width)
;
int offset =
mad24(out_chan_idx, inner_size,
mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
mad24(mad24(in_chan_idx, height, height_idx), width, width_idx
+ input_offset
));
int size = in_channels - in_chan_idx;
...
...
@@ -582,14 +592,14 @@ __kernel void weight_width_image_to_buffer(OUT_OF_RANGE_PARAMS
#endif
const int out_channels = global_size_dim1;
const int in_chan_blks = (in_channels + 3) >> 2;
const int hw_size =
height * width
;
const int inner_size =
in_channels * hw_size
;
const int hw_size =
mul24(height, width)
;
const int inner_size =
mul24(in_channels, hw_size)
;
const int out_chan_idx = h;
const int in_chan_idx = (w % in_chan_blks) << 2;
const int hw_idx = w / in_chan_blks;
const int in_chan_idx = (w - mul24(hw_idx, in_chan_blks)) << 2;
const int height_idx = hw_idx / width;
const int width_idx = hw_idx
% width
;
const int width_idx = hw_idx
- mul24(height_idx, width)
;
int offset =
mad24(out_chan_idx, inner_size,
mad24(mad24(in_chan_idx, height, height_idx), width, width_idx));
...
...
@@ -629,7 +639,11 @@ __kernel void winograd_filter_buffer_to_image_2x2(OUT_OF_RANGE_PARAMS
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int hw = mul24(height, width);
const int offset =
mad24(mad24(out_channel_idx, in_channels, in_channel_idx),
hw,
input_offset);
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt;
...
...
@@ -732,7 +746,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
const int width_idx = w << 2;
const int size = width - width_idx;
int offset =
h * width + width_idx
;
int offset =
mad24(h, width, width_idx)
;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
...
...
@@ -752,7 +766,7 @@ __kernel void winograd_filter_image_to_buffer_2x2(OUT_OF_RANGE_PARAMS
}
coord.y += height;
offset
+= height * width
;
offset
= mad24(height, width, offset)
;
}
}
...
...
@@ -777,7 +791,9 @@ __kernel void winograd_filter_buffer_to_image_6x6(OUT_OF_RANGE_PARAMS
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int offset =mad24(mad24(out_channel_idx, in_channels, in_channel_idx),
mul24(height, width),
input_offset);
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, t1;
...
...
@@ -909,7 +925,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
const int width_idx = w << 2;
const int size = width - width_idx;
int offset =
h * width + width_idx
;
int offset =
mad24(h, width, width_idx)
;
int2 coord = (int2)(w, h);
DATA_TYPE4 values;
...
...
@@ -928,7 +944,7 @@ __kernel void winograd_filter_image_to_buffer_6x6(OUT_OF_RANGE_PARAMS
vstore4(values, 0, output + offset);
}
coord.y += height;
offset
+= height * width
;
offset
= mad24(height, width, offset)
;
}
}
...
...
@@ -953,7 +969,9 @@ __kernel void winograd_filter_buffer_to_image_4x4(OUT_OF_RANGE_PARAMS
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int offset = mad24(mad24(out_channel_idx, in_channels, in_channel_idx),
mul24(height, width),
input_offset);
const int length = min((in_channels - in_channel_idx) * 9, 36);
DATA_TYPE in[36] = {0};
DATA_TYPE4 tt0, tt1, tt2;
...
...
@@ -1058,7 +1076,7 @@ __kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
const
int
width_idx
=
w
<<
2
;
const
int
size
=
width
-
width_idx
;
int
offset
=
h
*
width
+
width_idx
;
int
offset
=
mad24
(
h,
width,
width_idx
)
;
int2
coord
=
(
int2
)(
w,
h
)
;
DATA_TYPE4
values
;
...
...
@@ -1077,6 +1095,6 @@ __kernel void winograd_filter_image_to_buffer_4x4(OUT_OF_RANGE_PARAMS
vstore4
(
values,
0
,
output
+
offset
)
;
}
coord.y
+=
height
;
offset
+=
height
*
width
;
offset
=
mad24
(
height,
width,
offset
)
;
}
}
\ No newline at end of file
mace/ops/opencl/cl/buffer_transform.cl
浏览文件 @
1eb50c6c
...
...
@@ -23,9 +23,11 @@ __kernel void pad_input(BUFFER_OUT_OF_RANGE_PARAMS
#endif
const int padded_chan_blk = (padded_chan + 3) >> 2;
const int padded_width_idx = padded_wc_blk_idx / padded_chan_blk;
const int padded_chan_blk_idx = padded_wc_blk_idx % padded_chan_blk;
const int padded_chan_blk_idx =
padded_wc_blk_idx - mul24(padded_width_idx, padded_chan_blk);
const int batch_idx = padded_hb_idx / padded_height;
const int padded_height_idx = padded_hb_idx % padded_height;
const int padded_height_idx =
padded_hb_idx - mul24(batch_idx, padded_height);
const int padded_chan_idx = padded_chan_blk_idx << 2;
const int in_height_idx = padded_height_idx - pad_top;
const int in_width_idx = padded_width_idx - pad_left;
...
...
@@ -81,7 +83,7 @@ __kernel void transform_conv_filter(BUFFER_OUT_OF_RANGE_PARAMS
const int out_chan_blk = global_size_dim1;
const int h_idx = hw_idx / width;
const int w_idx = hw_idx
% width
;
const int w_idx = hw_idx
- mul24(h_idx, width)
;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_offset = mad24(mad24(mad24(out_chan_idx, in_chan, in_chan_idx),
height, h_idx), width, w_idx) + input_offset;
...
...
mace/ops/opencl/cl/channel_shuffle.cl
浏览文件 @
1eb50c6c
...
...
@@ -19,10 +19,10 @@ __kernel void channel_shuffle(OUT_OF_RANGE_PARAMS
#
endif
const
int
width
=
global_size_dim1
;
const
int
group_blks
=
groups
/
4
;
const
int
groups_blks_width
=
group_blks
*
width
;
const
int
channels_per_group_blks
=
channels_per_group
/
4
;
const
int
channels_per_group_blks_width
=
channels_per_group_blks
*
width
;
const
int
group_blks
=
groups
>>
2
;
const
int
groups_blks_width
=
mul24
(
group_blks,
width
)
;
const
int
channels_per_group_blks
=
channels_per_group
>>
2
;
const
int
channels_per_group_blks_width
=
mul24
(
channels_per_group_blks,
width
)
;
DATA_TYPE4
in_chan_data0,
in_chan_data1,
in_chan_data2,
in_chan_data3
;
DATA_TYPE4
out_chan_data0,
out_chan_data1,
out_chan_data2,
out_chan_data3
;
...
...
mace/ops/opencl/cl/conv_2d_1x1.cl
浏览文件 @
1eb50c6c
...
...
@@ -45,14 +45,16 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
w.y = w.x + in_width_stride;
w.z = w.y + in_width_stride;
w.w = w.z + in_width_stride;
int out_hb_idx = mul24((out_hb % height), stride);
int batch = out_hb / height;
int h_idx = out_hb - mul24(batch, height);
int out_hb_idx = mul24(h_idx, stride);
w.x = select(w.x, INT_MIN, w.x >= in_width);
w.y = select(w.y, INT_MIN, w.y >= in_width);
w.z = select(w.z, INT_MIN, w.z >= in_width);
w.w = select(w.w, INT_MIN, w.w >= in_width);
out_hb_idx = select(mad24(
(out_hb / height)
, in_height, out_hb_idx),
out_hb_idx = select(mad24(
batch
, in_height, out_hb_idx),
-1,
out_hb_idx >= in_height);
...
...
mace/ops/opencl/cl/conv_2d_1x1_buffer.cl
浏览文件 @
1eb50c6c
...
...
@@ -30,10 +30,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int out_chan_blk_idx =
out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk);
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx
% out_height
;
const int out_height_idx = out_hb_idx
- mul24(batch_idx, out_height)
;
const int out_width_idx = out_width_blk_idx << 1;
const int out_chan_idx = out_chan_blk_idx << 2;
...
...
mace/ops/opencl/cl/conv_2d_buffer.cl
浏览文件 @
1eb50c6c
...
...
@@ -35,10 +35,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx % out_chan_blk;
const int out_chan_blk_idx =
out_wc_blk_idx - mul24(out_width_blk_idx, out_chan_blk);
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx
% out_height
;
const int out_height_idx = out_hb_idx
- mul24(batch_idx, out_height)
;
const int out_width_idx = out_width_blk_idx << 2;
const int out_chan_idx = out_chan_blk_idx << 2;
...
...
mace/ops/opencl/cl/crop.cl
浏览文件 @
1eb50c6c
...
...
@@ -27,7 +27,7 @@ __kernel void crop(OUT_OF_RANGE_PARAMS
#
endif
const
int
b
=
hb_idx
/
out_height
;
const
int
h
=
hb_idx
%
out_height
;
const
int
h
=
hb_idx
-
mul24
(
b,
out_height
)
;
const
int
in_chan_blk_idx
=
chan_blk_idx
+
offset_chan_blk
;
const
int
in_width_idx
=
width_idx
+
offset_w
;
const
int
in_h
=
h
+
offset_h
;
...
...
mace/ops/opencl/cl/depth_to_space.cl
浏览文件 @
1eb50c6c
...
...
@@ -24,10 +24,10 @@ __kernel void depth_to_space(OUT_OF_RANGE_PARAMS
const int out_pos = mad24(out_d, output_width, out_w);
const int in_hb = out_hb / block_size;
const int offset_h = out_hb
% block_size
;
const int offset_h = out_hb
- mul24(in_hb, block_size)
;
const int in_w = out_w / block_size;
const int offset_w = out_w
% block_size
;
const int offset_d =
(offset_h * block_size + offset_w) * output_depth_blocks
;
const int offset_w = out_w
- mul24(in_w, block_size)
;
const int offset_d =
mul24(mad24(offset_h, block_size, offset_w), output_depth_blocks)
;
const int in_d = out_d + offset_d;
if (in_hb >= input_hb || in_w >= input_width |
|
in_d
>=
input_depth_blocks
)
{
...
...
mace/ops/opencl/cl/depthwise_conv2d_buffer.cl
浏览文件 @
1eb50c6c
...
...
@@ -35,10 +35,10 @@ __kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS
const int out_chan_blk = (out_chan + 3) >> 2;
const int out_width_blk_idx = out_wc_blk_idx / out_chan_blk;
const int out_chan_blk_idx = out_wc_blk_idx
% out_chan_blk
;
const int out_chan_blk_idx = out_wc_blk_idx
- mul24(out_width_blk_idx, out_chan_blk)
;
const int batch_idx = out_hb_idx / out_height;
const int out_height_idx = out_hb_idx
% out_height
;
const int out_height_idx = out_hb_idx
- mul24(batch_idx, out_height)
;
const int out_width_idx = out_width_blk_idx << 2;
const int out_chan_idx = out_chan_blk_idx << 2;
const int in_chan_idx = out_chan_idx;
...
...
mace/ops/opencl/cl/fully_connected.cl
浏览文件 @
1eb50c6c
...
...
@@ -35,7 +35,7 @@ __kernel void fully_connected(OUT_OF_RANGE_PARAMS
for (short h_idx = 0; h_idx < input_height; ++h_idx) {
for (short w_idx = 0; w_idx < input_width; ++w_idx) {
input_coord.x = w_idx;
weight_x =
(h_idx * input_width + w_idx) * input_channel
;
weight_x =
mul24(mad24(h_idx, input_width, w_idx), input_channel)
;
#pragma unroll
for (short chan_idx = 0; chan_idx < input_chan_blk; ++chan_idx) {
input_value = READ_IMAGET(input, SAMPLER, input_coord);
...
...
@@ -84,7 +84,7 @@ __kernel void fully_connected_width(OUT_OF_RANGE_PARAMS
const int batch_out_blk_idx = get_global_id(2);
const int batch_idx = batch_out_blk_idx / out_blks;
const int out_blk_idx = batch_out_blk_idx
% out_blks
;
const int out_blk_idx = batch_out_blk_idx
- mul24(batch_idx, out_blks)
;
const short in_outer_size = mul24(input_width, in_chan_blks);
const short weight_y = mad24(out_blk_idx, 4, inter_out_idx);
...
...
mace/ops/opencl/cl/lstmcell.cl
浏览文件 @
1eb50c6c
...
...
@@ -20,117 +20,118 @@ __kernel void lstmcell(OUT_OF_RANGE_PARAMS
if
(
w_blk_idx
>=
global_size_dim0
||
h_idx
>=
global_size_dim1
)
return
;
#
endif
//
fc_res0
->
i
//
fc_res1
->
j
//
fc_res2
->
f
//
fc_res3
->
o
DATA_TYPE4
fc_res0
=
0.0
,
fc_res1
=
0.0
,
fc_res2
=
0.0
,
fc_res3
=
0.0
;
DATA_TYPE4
in,
pre_h
;
DATA_TYPE4
w0,
w1,
w2,
w3
;
int
k_offset
;
//
concat
matmul
const
int
pos_x0
=
w_blk_idx
;
const
int
pos_x1
=
pos_x0
+
global_size_dim0
;
const
int
pos_x2
=
pos_x1
+
global_size_dim0
;
const
int
pos_x3
=
pos_x2
+
global_size_dim0
;
//
bias
//
fc_res0
->
i
//
fc_res1
->
j
//
fc_res2
->
f
//
fc_res3
->
o
DATA_TYPE4
fc_res0,
fc_res1,
fc_res2,
fc_res3
;
fc_res0
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
pos_x0,
0
))
;
fc_res1
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
pos_x1,
0
))
;
fc_res2
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
pos_x2,
0
))
;
fc_res3
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
pos_x3,
0
))
;
for
(
short
i
=
0
; i < in_w_blk; ++i) {
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
i,
h_idx
))
;
int
k
=
i
<<
2
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k
))
;
fc_res0
+=
in.x
*
w0
;
fc_res1
+=
in.x
*
w1
;
fc_res2
+=
in.x
*
w2
;
fc_res3
+=
in.x
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k
))
;
fc_res0
=
mad
(
in.x,
w0,
fc_res0
)
;
fc_res1
=
mad
(
in.x,
w1,
fc_res1
)
;
fc_res2
=
mad
(
in.x,
w2,
fc_res2
)
;
fc_res3
=
mad
(
in.x,
w3,
fc_res3
)
;
k
+=
1
;
k_offset
=
select
(
-1
,
k,
k
<
width
)
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k_offset
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k_offset
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k_offset
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k_offset
))
;
fc_res0
+=
in.y
*
w0
;
fc_res1
+=
in.y
*
w1
;
fc_res2
+=
in.y
*
w2
;
fc_res3
+=
in.y
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k_offset
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k_offset
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k_offset
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k_offset
))
;
fc_res0
=
mad
(
in.y,
w0,
fc_res0
)
;
fc_res1
=
mad
(
in.y,
w1,
fc_res1
)
;
fc_res2
=
mad
(
in.y,
w2,
fc_res2
)
;
fc_res3
=
mad
(
in.y,
w3,
fc_res3
)
;
k
+=
1
;
k_offset
=
select
(
-1
,
k,
k
<
width
)
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k_offset
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k_offset
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k_offset
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k_offset
))
;
fc_res0
+=
in.z
*
w0
;
fc_res1
+=
in.z
*
w1
;
fc_res2
+=
in.z
*
w2
;
fc_res3
+=
in.z
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k_offset
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k_offset
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k_offset
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k_offset
))
;
fc_res0
=
mad
(
in.z,
w0,
fc_res0
)
;
fc_res1
=
mad
(
in.z,
w1,
fc_res1
)
;
fc_res2
=
mad
(
in.z,
w2,
fc_res2
)
;
fc_res3
=
mad
(
in.z,
w3,
fc_res3
)
;
k
+=
1
;
k_offset
=
select
(
-1
,
k,
k
<
width
)
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k_offset
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k_offset
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k_offset
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k_offset
))
;
fc_res0
+=
in.w
*
w0
;
fc_res1
+=
in.w
*
w1
;
fc_res2
+=
in.w
*
w2
;
fc_res3
+=
in.w
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k_offset
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k_offset
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k_offset
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k_offset
))
;
fc_res0
=
mad
(
in.w,
w0,
fc_res0
)
;
fc_res1
=
mad
(
in.w,
w1,
fc_res1
)
;
fc_res2
=
mad
(
in.w,
w2,
fc_res2
)
;
fc_res3
=
mad
(
in.w,
w3,
fc_res3
)
;
}
for
(
short
i
=
0
; i < global_size_dim0; ++i) {
pre_h
=
READ_IMAGET
(
pre_output,
SAMPLER,
(
int2
)(
i,
h_idx
))
;
int
k
=
(
i
<<
2
)
+
width
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k
))
;
fc_res0
+=
pre_h.x
*
w0
;
fc_res1
+=
pre_h.x
*
w1
;
fc_res2
+=
pre_h.x
*
w2
;
fc_res3
+=
pre_h.x
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k
))
;
fc_res0
=
mad
(
pre_h.x,
w0,
fc_res0
)
;
fc_res1
=
mad
(
pre_h.x,
w1,
fc_res1
)
;
fc_res2
=
mad
(
pre_h.x,
w2,
fc_res2
)
;
fc_res3
=
mad
(
pre_h.x,
w3,
fc_res3
)
;
k
+=
1
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k
))
;
fc_res0
+=
pre_h.y
*
w0
;
fc_res1
+=
pre_h.y
*
w1
;
fc_res2
+=
pre_h.y
*
w2
;
fc_res3
+=
pre_h.y
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k
))
;
fc_res0
=
mad
(
pre_h.y,
w0,
fc_res0
)
;
fc_res1
=
mad
(
pre_h.y,
w1,
fc_res1
)
;
fc_res2
=
mad
(
pre_h.y,
w2,
fc_res2
)
;
fc_res3
=
mad
(
pre_h.y,
w3,
fc_res3
)
;
k
+=
1
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k
))
;
fc_res0
+=
pre_h.z
*
w0
;
fc_res1
+=
pre_h.z
*
w1
;
fc_res2
+=
pre_h.z
*
w2
;
fc_res3
+=
pre_h.z
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k
))
;
fc_res0
=
mad
(
pre_h.z,
w0,
fc_res0
)
;
fc_res1
=
mad
(
pre_h.z,
w1,
fc_res1
)
;
fc_res2
=
mad
(
pre_h.z,
w2,
fc_res2
)
;
fc_res3
=
mad
(
pre_h.z,
w3,
fc_res3
)
;
k
+=
1
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
k
))
;
fc_res0
+=
pre_h.w
*
w0
;
fc_res1
+=
pre_h.w
*
w1
;
fc_res2
+=
pre_h.w
*
w2
;
fc_res3
+=
pre_h.w
*
w3
;
w0
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x0
,
k
))
;
w1
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x1
,
k
))
;
w2
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
2,
k
))
;
w3
=
READ_IMAGET
(
weight,
SAMPLER,
(
int2
)(
pos_x
3,
k
))
;
fc_res0
=
mad
(
pre_h.w,
w0,
fc_res0
)
;
fc_res1
=
mad
(
pre_h.w,
w1,
fc_res1
)
;
fc_res2
=
mad
(
pre_h.w,
w2,
fc_res2
)
;
fc_res3
=
mad
(
pre_h.w,
w3,
fc_res3
)
;
}
//
bias
DATA_TYPE4
b0,
b1,
b2,
b3
;
b0
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
w_blk_idx,
0
))
;
b1
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0,
0
))
;
b2
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
2
,
0
))
;
b3
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
w_blk_idx
+
global_size_dim0
*
3
,
0
))
;
fc_res0
+=
b0
;
fc_res1
+=
b1
;
fc_res2
+=
b2
;
fc_res3
+=
b3
;
//
gate
DATA_TYPE4
pre_c,
c,
h
;
pre_c
=
READ_IMAGET
(
pre_cell,
SAMPLER,
(
int2
)(
w_blk_idx,
h_idx
))
;
...
...
mace/ops/opencl/cl/matmul.cl
浏览文件 @
1eb50c6c
...
...
@@ -19,7 +19,7 @@ __kernel void matmul(OUT_OF_RANGE_PARAMS
#
endif
const
int
batch
=
hb
/
height_blocks
;
const
int
ty
=
(
hb
%
height_blocks
)
;
const
int
ty
=
hb
-
mul24
(
batch,
height_blocks
)
;
const
int
gy
=
mad24
(
batch,
height_blocks,
ty
)
;
const
int
bm
=
mad24
(
batch,
M,
ty
<<
2
)
;
const
int
bk
=
mul24
(
batch,
k_blocks
)
;
...
...
mace/ops/opencl/cl/pad.cl
浏览文件 @
1eb50c6c
...
...
@@ -14,7 +14,7 @@ __kernel void pad(OUT_OF_RANGE_PARAMS
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
batch_idx
=
hb_idx
/
output_height
;
const
int
height_idx
=
hb_idx
%
output_height
;
const
int
height_idx
=
hb_idx
-
mul24
(
batch_idx,
output_height
)
;
const
int
input_padded_height
=
input_height
+
height_padding
;
const
int
input_padded_width
=
input_width
+
width_padding
;
...
...
mace/ops/opencl/cl/pooling.cl
浏览文件 @
1eb50c6c
...
...
@@ -42,9 +42,11 @@ __kernel void pooling(OUT_OF_RANGE_PARAMS
#endif
const int out_width = global_size_dim1;
const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride_h) - pad_top;
const int in_width_start = mul24(out_width_idx, stride_w) - pad_left;
const int n_b = out_hb_idx / out_height;
const int mod_b = out_hb_idx - mul24(n_b, out_height);
const int batch_idx = mul24(n_b, in_height);
const int in_height_start = mad24(mod_b, stride_h, -pad_top);
const int in_width_start = mad24(out_width_idx, stride_w, -pad_left);
const int in_channel_offset = mul24(out_chan_idx, in_width);
...
...
mace/ops/opencl/cl/pooling_buffer.cl
浏览文件 @
1eb50c6c
...
...
@@ -47,7 +47,7 @@ __kernel void pooling(BUFFER_OUT_OF_RANGE_PARAMS
const
int
in_wc_size
=
mul24
(
in_width,
in_chan
)
;
const
int
batch_idx
=
out_hb_idx
/
out_height
;
const
int
out_height_idx
=
out_hb_idx
%
out_height
;
const
int
out_height_idx
=
out_hb_idx
-
mul24
(
batch_idx,
out_height
)
;
const
int
chan_idx
=
out_chan_blk_idx
<<
2
;
const
int
in_height_start
=
mul24
(
out_height_idx,
stride_h
)
-
pad_top
;
const
int
in_width_start
=
mul24
(
out_width_idx,
stride_w
)
-
pad_left
;
...
...
mace/ops/opencl/cl/reduce_mean.cl
浏览文件 @
1eb50c6c
...
...
@@ -11,9 +11,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
float
image_size_reciprocal,
__private
const
float
in_width_reciprocal,
__private
const
int
channel_blocks,
__private
const
float
channel_blocks_reciprocal,
__write_only
image2d_t
output
)
{
const
int
i
=
get_local_id
(
0
)
;
const
int
j
=
get_local_id
(
1
)
;
...
...
@@ -26,7 +24,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS
const
int
dim0_size
=
get_local_size
(
0
)
;
float4
tmp
=
(
float4
)
{0,
0
,
0
,
0}
;
const
int
index
=
mad24
(
j,
dim0_size,
i
)
;
const
int
b
=
floor
(
k
*
channel_blocks_reciprocal
)
;
const
int
b
=
k
/
channel_blocks
;
const
int
ch
=
mad24
(
b,
-channel_blocks,
k
)
;
DATA_TYPE4
in
;
...
...
@@ -40,7 +38,7 @@ __kernel void reduce_mean(OUT_OF_RANGE_PARAMS
#
pragma
unroll
for
(
int
l
=
0
; l < valid_part_len; ++l) {
int
offset
=
base_offset
+
l
;
int
h_id
=
floor
(
offset
*
in_width_reciprocal
)
;
int
h_id
=
offset
/
in_width
;
int
w_id
=
mad24
(
h_id,
-in_width,
offset
)
;
int
pos_x
=
mad24
(
ch,
in_width,
w_id
)
;
int
pos_y
=
mad24
(
b,
in_height,
h_id
)
;
...
...
mace/ops/opencl/cl/resize_bicubic.cl
浏览文件 @
1eb50c6c
...
...
@@ -36,7 +36,7 @@ __kernel void resize_bicubic_nocache(OUT_OF_RANGE_PARAMS
#
endif
const
int
b
=
hb
/
out_height
;
const
int
h
=
hb
%
out_height
;
const
int
h
=
hb
-
mul24
(
b,
out_height
)
;
const
float
h_in
=
h
*
height_scale
;
const
float
w_in
=
w
*
width_scale
;
...
...
mace/ops/opencl/cl/resize_bilinear.cl
浏览文件 @
1eb50c6c
...
...
@@ -24,7 +24,7 @@ __kernel void resize_bilinear_nocache(OUT_OF_RANGE_PARAMS
const
int
out_width
=
global_size_dim1
;
const
int
b
=
hb
/
out_height
;
const
int
h
=
hb
%
out_height
;
const
int
h
=
hb
-
mul24
(
b,
out_height
)
;
const
float
h_in
=
h
*
height_scale
;
const
float
w_in
=
w
*
width_scale
;
...
...
mace/ops/opencl/cl/softmax_buffer.cl
浏览文件 @
1eb50c6c
...
...
@@ -20,7 +20,7 @@ __kernel void softmax(BUFFER_OUT_OF_RANGE_PARAMS
const
int
chan_blks
=
global_size_dim0
-
1
;
const
int
width
=
global_size_dim1
;
const
int
batch_idx
=
hb_idx
/
height
;
const
int
height_idx
=
hb_idx
%
height
;
const
int
height_idx
=
hb_idx
-
mul24
(
batch_idx,
height
)
;
const
int
chan_idx
=
chan_blk_idx
<<
2
;
const
int
offset_base
=
mul24
(
mad24
(
mad24
(
batch_idx,
height,
height_idx
)
,
...
...
mace/ops/opencl/cl/space_to_batch.cl
浏览文件 @
1eb50c6c
...
...
@@ -25,14 +25,19 @@ __kernel void space_to_batch(OUT_OF_RANGE_PARAMS
#endif
const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx
% batch_height
;
const int batch_h_idx = batch_hb_idx
- mul24(batch_b_idx, batch_height)
;
const int block_size = mul24(block_height, block_width);
const int space_b_idx = batch_b_idx % batch_size;
const int remaining_batch_idx = batch_b_idx / batch_size;
const int space_h_idx = (remaining_batch_idx / block_width) +
const int space_b_idx =
batch_b_idx - mul24(remaining_batch_idx, batch_size);
const int n_remain_blk_w = remaining_batch_idx / block_width;
const int mod_remain_blk_w =
remaining_batch_idx - mul24(n_remain_blk_w, block_width);
const int space_h_idx = n_remain_blk_w +
mul24(batch_h_idx, block_height) - padding_height;
const int space_w_idx =
(remaining_batch_idx % block_width)
+
const int space_w_idx =
mod_remain_blk_w
+
mul24(batch_w_idx, block_width) - padding_width;
const int space_coord_x = select(mul24(chan_idx, space_width) + space_w_idx,
...
...
mace/ops/opencl/cl/space_to_depth.cl
浏览文件 @
1eb50c6c
...
...
@@ -24,10 +24,10 @@ __kernel void space_to_depth(OUT_OF_RANGE_PARAMS
const int in_pos = mad24(d, input_width, w);
const int out_hb = hb / block_size;
const int offset_h = hb
% block_size
;
const int offset_h = hb
- mul24(out_hb, block_size)
;
const int out_w = w / block_size;
const int offset_w = w
% block_size
;
const int offset_d =
(offset_h * block_size + offset_w) * input_depth_blocks
;
const int offset_w = w
- mul24(out_w, block_size)
;
const int offset_d =
mul24(input_depth_blocks, mad24(offset_h, block_size, offset_w))
;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_hb >= output_hb |
|
out_w
>=
output_width
)
{
...
...
mace/ops/opencl/cl/sqrdiff_mean.cl
浏览文件 @
1eb50c6c
...
...
@@ -12,9 +12,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
float
image_size_reciprocal,
__private
const
float
in_width_reciprocal,
__private
const
int
channel_blocks,
__private
const
float
channel_blocks_reciprocal,
__write_only
image2d_t
output
)
{
const
int
i
=
get_local_id
(
0
)
;
const
int
j
=
get_local_id
(
1
)
;
...
...
@@ -27,7 +25,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS
const
int
dim0_size
=
get_local_size
(
0
)
;
float4
tmp
=
(
float4
)
{0,
0
,
0
,
0}
;
const
int
index
=
mad24
(
j,
dim0_size,
i
)
;
const
int
b
=
floor
(
k
*
channel_blocks_reciprocal
)
;
const
int
b
=
k
/
channel_blocks
;
const
int
ch
=
mad24
(
b,
-channel_blocks,
k
)
;
DATA_TYPE4
in
;
...
...
@@ -43,7 +41,7 @@ __kernel void sqrdiff_mean(OUT_OF_RANGE_PARAMS
#
pragma
unroll
for
(
int
l
=
0
; l < valid_part_len; ++l) {
int
offset
=
base_offset
+
l
;
int
h_id
=
floor
(
offset
*
in_width_reciprocal
)
;
int
h_id
=
offset
/
in_width
;
int
w_id
=
mad24
(
h_id,
-in_width,
offset
)
;
int
pos_x
=
mad24
(
ch,
in_width,
w_id
)
;
int
pos_y
=
mad24
(
b,
in_height,
h_id
)
;
...
...
mace/ops/opencl/image/reduce_mean.h
浏览文件 @
1eb50c6c
...
...
@@ -111,9 +111,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
const
int
group_size
=
lws
[
0
]
*
lws
[
1
]
*
lws
[
2
];
const
int
partial_len
=
(
image_size
+
group_size
-
1
)
/
group_size
;
const
int
remain_index
=
image_size
%
group_size
;
const
float
in_width_reciprocal
=
1.
f
/
in_width
;
const
float
img_size_reciprocal
=
1.
f
/
(
in_width
*
in_height
);
const
float
channel_blk_reciprocal
=
1.
f
/
channel_blocks
;
MACE_OUT_OF_RANGE_INIT
(
kernel_
);
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
...
...
@@ -130,9 +128,7 @@ MaceStatus ReduceMeanKernel<T>::Compute(
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_width
));
kernel_
.
setArg
(
idx
++
,
img_size_reciprocal
);
kernel_
.
setArg
(
idx
++
,
in_width_reciprocal
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channel_blocks
));
kernel_
.
setArg
(
idx
++
,
channel_blk_reciprocal
);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
...
...
mace/ops/opencl/image/sqrdiff_mean.h
浏览文件 @
1eb50c6c
...
...
@@ -107,9 +107,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
const
int
group_size
=
lws
[
0
]
*
lws
[
1
]
*
lws
[
2
];
const
int
partial_len
=
(
image_size
+
group_size
-
1
)
/
group_size
;
const
int
remain_index
=
image_size
%
group_size
;
const
float
in_width_reciprocal
=
1.
f
/
in_width
;
const
float
img_size_reciprocal
=
1.
f
/
(
in_width
*
in_height
);
const
float
channel_blk_reciprocal
=
1.
f
/
channel_blocks
;
MACE_OUT_OF_RANGE_INIT
(
kernel_
);
if
(
!
IsVecEqual
(
input_shape_
,
input0
->
shape
()))
{
...
...
@@ -127,9 +125,7 @@ MaceStatus SqrDiffMeanKernel<T>::Compute(
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_width
));
kernel_
.
setArg
(
idx
++
,
img_size_reciprocal
);
kernel_
.
setArg
(
idx
++
,
in_width_reciprocal
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channel_blocks
));
kernel_
.
setArg
(
idx
++
,
channel_blk_reciprocal
);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input0
->
shape
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录