Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
冰之2023
Mace
提交
63e6e07f
Mace
项目概览
冰之2023
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
63e6e07f
编写于
12月 11, 2017
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimizing OpenCL kernel with mad/mad24/mul24
上级
22581f22
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
141 addition
and
148 deletion
+141
-148
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+4
-3
mace/kernels/opencl/cl/bias_add.cl
mace/kernels/opencl/cl/bias_add.cl
+1
-2
mace/kernels/opencl/cl/concat.cl
mace/kernels/opencl/cl/concat.cl
+8
-8
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+36
-43
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+33
-35
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+44
-41
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+6
-6
mace/kernels/opencl/cl/relu.cl
mace/kernels/opencl/cl/relu.cl
+2
-2
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+7
-8
未找到文件。
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
63e6e07f
...
...
@@ -17,12 +17,13 @@ __kernel void batch_norm(__read_only image2d_t input,
DATA_TYPE4
mean_value
=
READ_IMAGET
(
mean,
SAMPLER,
(
int2
)(
ch_blk,
0
))
;
DATA_TYPE4
var_value
=
READ_IMAGET
(
var,
SAMPLER,
(
int2
)(
ch_blk,
0
))
;
//
native_rsqrt
seems
not
faster
than
rsqrt
DATA_TYPE4
new_scale
=
scale_value
*
rsqrt
(
var_value
+
(
DATA_TYPE4
)
epsilon
)
;
DATA_TYPE4
new_offset
=
offset_value
-
mean_value
*
new_scale
;
DATA_TYPE4
new_offset
=
mad
(
0
-
mean_value,
new_scale,
offset_value
)
;
const
int
pos
=
ch_blk
*
width
+
w
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
out
=
in
*
new_scale
+
new_offset
;
DATA_TYPE4
out
=
mad
(
in,
new_scale,
new_offset
)
;
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/bias_add.cl
浏览文件 @
63e6e07f
...
...
@@ -8,8 +8,7 @@ __kernel void bias_add(__read_only image2d_t input,
const
int
hb
=
get_global_id
(
2
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
pos
=
ch_blk
*
width
+
w
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
bias_value
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
ch_blk,
0
))
;
DATA_TYPE4
out
=
in
+
bias_value
;
...
...
mace/kernels/opencl/cl/concat.cl
浏览文件 @
63e6e07f
...
...
@@ -30,37 +30,37 @@ __kernel void concat_channel(__read_only image2d_t input0,
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
input0_chan_blk
=
(
input0_chan
+
3
)
/
4
;
const
int
input0_chan_blk
=
(
input0_chan
+
3
)
>>
2
;
DATA_TYPE4
data
=
0
;
#
ifdef
DIVISIBLE_FOUR
if
(
chan_blk_idx
+
1
<=
input0_chan_blk
)
{
data
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
chan_blk_idx
*
width
+
width_idx
,
hb_idx
))
;
(
int2
)(
mad24
(
chan_blk_idx,
width,
width_idx
)
,
hb_idx
))
;
}
else
{
data
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
(
chan_blk_idx
-
input0_chan_blk
)
*
width
+
width_idx
,
hb_idx
))
;
(
int2
)(
mad24
((
chan_blk_idx
-
input0_chan_blk
)
,
width,
width_idx
)
,
hb_idx
))
;
}
#
else
if
(
chan_blk_idx
+
1
<
input0_chan_blk
)
{
data
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
chan_blk_idx
*
width
+
width_idx
,
hb_idx
))
;
(
int2
)(
mad24
(
chan_blk_idx,
width,
width_idx
)
,
hb_idx
))
;
}
else
if
(
chan_blk_idx
>=
input0_chan_blk
)
{
const
int
in_chan_idx
=
chan_blk_idx
-
input0_chan_blk
;
DATA_TYPE4
data0
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
in_chan_idx
*
width
+
width_idx
,
hb_idx
))
;
(
int2
)(
mad24
(
in_chan_idx,
width,
width_idx
)
,
hb_idx
))
;
DATA_TYPE4
data1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
(
in_chan_idx
+
1
)
*
width
+
width_idx
,
hb_idx
))
;
(
int2
)(
mad24
((
in_chan_idx
+
1
)
,
width,
width_idx
)
,
hb_idx
))
;
data
=
stitch_vector
(
data0,
data1,
input0_chan
%
4
,
true
)
;
}
else
{
DATA_TYPE4
data0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
chan_blk_idx
*
width
+
width_idx
,
hb_idx
))
;
(
int2
)(
mad24
(
chan_blk_idx,
width,
width_idx
)
,
hb_idx
))
;
DATA_TYPE4
data1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
width_idx,
hb_idx
))
;
...
...
@@ -68,7 +68,7 @@ __kernel void concat_channel(__read_only image2d_t input0,
}
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
chan_blk_idx
*
width
+
width_idx
,
hb_idx
)
,
data
)
;
WRITE_IMAGET
(
output,
(
int2
)(
mad24
(
chan_blk_idx,
width,
width_idx
)
,
hb_idx
)
,
data
)
;
}
//__kernel
void
concat_width
(
__read_only
image2d_t
input0,
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
63e6e07f
...
...
@@ -19,7 +19,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
const
int
rounded_in_ch
=
in_ch_blks
<<
2
;
#
ifdef
BIAS
DATA_TYPE4
out0
=
...
...
@@ -41,29 +41,29 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
int
in_width3
=
in_width2
+
out_w_blks
;
const
int
height_idx
=
(
out_hb
%
out_height
)
-
padding_top
;
#
else
int
in_width0
=
out_w_blk
*
2
-
padding_left
;
int
in_width1
=
(
out_w_blk
+
out_w_blks
)
*
2
-
padding_left
;
int
in_width2
=
(
out_w_blk
+
2
*
out_w_blks
)
*
2
-
padding_left
;
int
in_width3
=
(
out_w_blk
+
3
*
out_w_blks
)
*
2
-
padding_left
;
const
int
height_idx
=
(
out_hb
%
out_height
)
*
2
-
padding_top
;
int
in_width0
=
(
out_w_blk
<<
1
)
-
padding_left
;
int
in_width1
=
(
(
out_w_blk
+
out_w_blks
)
<<
1
)
-
padding_left
;
int
in_width2
=
(
(
out_w_blk
+
(
out_w_blks
<<
1
))
<<
1
)
-
padding_left
;
int
in_width3
=
(
(
out_w_blk
+
(
out_w_blks
<<
1
)
+
out_w_blks
)
<<
1
)
-
padding_left
;
const
int
height_idx
=
(
(
out_hb
%
out_height
)
<<
1
)
-
padding_top
;
#
endif
const
int
batch_idx
=
(
out_hb
/
out_height
)
*
in_height
;
const
int
batch_idx
=
mul24
((
out_hb
/
out_height
)
,
in_height
)
;
const
int
rounded_in_ch_x_filter_width
=
mul24
(
rounded_in_ch,
filter_width
)
;
DATA_TYPE4
in0,
in1,
in2,
in3
;
DATA_TYPE4
weights0,
weights1,
weights2,
weights3
;
int
in_idx,
in_width_idx
;
//
Unrolling
this
loop
hurt
perfmance
for
(
short
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const
int
in_idx
=
mul24
(
in_ch_blk,
in_width
)
;
int
filter_x_part0
=
in_ch_blk
<<
2
;
for
(
short
hb_idx
=
0
; hb_idx < filter_height; ++hb_idx) {
int
in_hb_value
=
height_idx
+
hb_idx
;
in_hb_value
=
select
(
in_hb_value
+
batch_idx,
-1
,
(
in_hb_value
<
0
|
| in_hb_value >= in_height));
int filter_x_part1 = 0;
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
...
...
@@ -79,36 +79,37 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
#
undef
READ_INPUT
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
(
hb_idx
*
filter_width
+
width_idx
)
*
rounded_in_ch
;
//
int
filter_idx
=
(
hb_idx
*
filter_width
+
width_idx
)
*
rounded_in_ch
+
(
in_ch_blk
<<
2
)
;
int
filter_idx
=
filter_x_part0
+
filter_x_part1
;
weights0
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
out0
=
mad
(
in0.x,
weights0,
out0
)
;
out0
=
mad
(
in0.y,
weights1,
out0
)
;
out0
=
mad
(
in0.z,
weights2,
out0
)
;
out0
=
mad
(
in0.w,
weights3,
out0
)
;
//
Interleaving
load
and
mul
does
not
improve
performance
as
expected
out0
+=
in0.x
*
weights0
;
out0
+=
in0.y
*
weights1
;
out0
+=
in0.z
*
weights2
;
out0
+=
in0.w
*
weights3
;
out1
+=
in1.x
*
weights0
;
out1
+=
in1.y
*
weights1
;
out1
+=
in1.z
*
weights2
;
out1
+=
in1.w
*
weights3
;
out1
=
mad
(
in1.x,
weights0,
out1
)
;
out1
=
mad
(
in1.y,
weights1,
out1
)
;
out1
=
mad
(
in1.z,
weights2,
out1
)
;
out1
=
mad
(
in1.w,
weights3,
out1
)
;
out2
+=
in2.x
*
weights0
;
out2
+=
in2.y
*
weights1
;
out2
+=
in2.z
*
weights2
;
out2
+=
in2.w
*
weights3
;
out2
=
mad
(
in2.x,
weights0,
out2
)
;
out2
=
mad
(
in2.y,
weights1,
out2
)
;
out2
=
mad
(
in2.z,
weights2,
out2
)
;
out2
=
mad
(
in2.w,
weights3,
out2
)
;
out3
+=
in3.x
*
weights0
;
out3
+=
in3.y
*
weights1
;
out3
+=
in3.z
*
weights2
;
out3
+=
in3.w
*
weights3
;
out3
=
mad
(
in3.x,
weights0,
out3
)
;
out3
=
mad
(
in3.y,
weights1,
out3
)
;
out3
=
mad
(
in3.z,
weights2,
out3
)
;
out3
=
mad
(
in3.w,
weights3,
out3
)
;
filter_x_part1
+=
rounded_in_ch
;
}
filter_x_part0
+=
rounded_in_ch_x_filter_width
;
}
}
...
...
@@ -120,28 +121,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
out3
=
fmax
(
out3,
0
)
;
#
endif
const
int
out_x_base
=
out_ch_blk
*
out_width
;
const
int
out_x_base
=
mul24
(
out_ch_blk,
out_width
)
;
int
w
=
out_w_blk
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out0
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out0
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out1
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out1
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out2
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out2
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out3
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out3
)
;
}
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
63e6e07f
...
...
@@ -36,11 +36,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.w
=
w.z
+
out_w_blks
;
int
out_hb_idx
=
(
out_hb
%
height
)
;
#
else
w.x
=
out_w_blk
*
2
;
w.y
=
(
out_w_blk
+
out_w_blks
)
*
2
;
w.z
=
(
out_w_blk
+
2
*
out_w_blks
)
*
2
;
w.w
=
(
out_w_blk
+
3
*
out_w_blks
)
*
2
;
int
out_hb_idx
=
(
out_hb
%
height
)
*
2
;
w.x
=
out_w_blk
<<
1
;
w.y
=
(
out_w_blk
+
out_w_blks
)
<<
1
;
w.z
=
(
out_w_blk
+
(
out_w_blks
<<
1
))
<<
1
;
w.w
=
(
out_w_blk
+
(
out_w_blks
<<
1
)
+
out_w_blks
)
<<
1
;
int
out_hb_idx
=
(
out_hb
%
height
)
<<
1
;
#
endif
w.x
=
select
(
w.x,
INT_MIN,
w.x
>=
in_width
)
;
...
...
@@ -48,47 +48,46 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
w.z
=
select
(
w.z,
INT_MIN,
w.z
>=
in_width
)
;
w.w
=
select
(
w.w,
INT_MIN,
w.w
>=
in_width
)
;
out_hb_idx
=
select
(
out_hb_idx
+
(
out_hb
/
height
)
*
in_height
,
out_hb_idx
=
select
(
mad24
((
out_hb
/
height
)
,
in_height,
out_hb_idx
)
,
-1
,
out_hb_idx
>=
in_height
)
;
//
Unrolling
this
loop
hurt
perfmance
int
in_x_base
=
0
;
int
filter_x_base
=
0
;
for
(
int
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4
in0
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_x_base
+
w.x,
out_hb_idx
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_x_base
+
w.y,
out_hb_idx
))
;
DATA_TYPE4
in2
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_x_base
+
w.z,
out_hb_idx
))
;
DATA_TYPE4
in3
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_x_base
+
w.w,
out_hb_idx
))
;
const
int
filter_x0
=
in_ch_blk
<<
2
;
DATA_TYPE4
weights0
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x0,
out_ch_blk
))
;
DATA_TYPE4
weights1
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x0
+
1
,
out_ch_blk
))
;
DATA_TYPE4
weights2
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x0
+
2
,
out_ch_blk
))
;
DATA_TYPE4
weights3
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x0
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
out0
+=
in0.x
*
weights0
;
out0
+=
in0.y
*
weights1
;
out0
+=
in0.z
*
weights2
;
out0
+=
in0.w
*
weights3
;
out1
+=
in1.x
*
weights0
;
out1
+=
in1.y
*
weights1
;
out1
+=
in1.z
*
weights2
;
out1
+=
in1.w
*
weights3
;
out2
+=
in2.x
*
weights0
;
out2
+=
in2.y
*
weights1
;
out2
+=
in2.z
*
weights2
;
out2
+=
in2.w
*
weights3
;
out3
+=
in3.x
*
weights0
;
out3
+=
in3.y
*
weights1
;
out3
+=
in3.z
*
weights2
;
out3
+=
in3.w
*
weights3
;
DATA_TYPE4
weights0
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x_base
+
0
,
out_ch_blk
))
;
DATA_TYPE4
weights1
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x_base
+
1
,
out_ch_blk
))
;
DATA_TYPE4
weights2
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x_base
+
2
,
out_ch_blk
))
;
DATA_TYPE4
weights3
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_x_base
+
3
,
out_ch_blk
))
;
out0
=
mad
(
in0.x,
weights0,
out0
)
;
out0
=
mad
(
in0.y,
weights1,
out0
)
;
out0
=
mad
(
in0.z,
weights2,
out0
)
;
out0
=
mad
(
in0.w,
weights3,
out0
)
;
out1
=
mad
(
in1.x,
weights0,
out1
)
;
out1
=
mad
(
in1.y,
weights1,
out1
)
;
out1
=
mad
(
in1.z,
weights2,
out1
)
;
out1
=
mad
(
in1.w,
weights3,
out1
)
;
out2
=
mad
(
in2.x,
weights0,
out2
)
;
out2
=
mad
(
in2.y,
weights1,
out2
)
;
out2
=
mad
(
in2.z,
weights2,
out2
)
;
out2
=
mad
(
in2.w,
weights3,
out2
)
;
out3
=
mad
(
in3.x,
weights0,
out3
)
;
out3
=
mad
(
in3.y,
weights1,
out3
)
;
out3
=
mad
(
in3.z,
weights2,
out3
)
;
out3
=
mad
(
in3.w,
weights3,
out3
)
;
in_x_base
+=
in_width
;
filter_x_base
+=
4
;
}
#
ifdef
FUSED_RELU
...
...
@@ -99,7 +98,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3
=
fmax
(
out3,
0
)
;
#
endif
const
int
out_x_base
=
out_ch_blk
*
width
;
const
int
out_x_base
=
mul24
(
out_ch_blk,
width
)
;
int
out_x_idx
=
out_w_blk
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out0
)
;
...
...
@@ -114,5 +113,4 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out3
)
;
}
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
63e6e07f
...
...
@@ -17,7 +17,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
const
int
rounded_in_ch
=
in_ch_blks
<<
2
;
#
ifdef
BIAS
DATA_TYPE4
out0
=
...
...
@@ -42,29 +42,30 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int
in_width4
=
in_width3
+
out_w_blks
;
const
int
height_idx
=
(
out_hb
%
out_height
)
-
padding_top
;
#
else
int
in_width0
=
out_w_blk
*
2
-
padding_left
;
int
in_width1
=
(
out_w_blk
+
out_w_blks
)
*
2
-
padding_left
;
int
in_width2
=
(
out_w_blk
+
2
*
out_w_blks
)
*
2
-
padding_left
;
int
in_width3
=
(
out_w_blk
+
3
*
out_w_blks
)
*
2
-
padding_left
;
int
in_width4
=
(
out_w_blk
+
4
*
out_w_blks
)
*
2
-
padding_left
;
const
int
height_idx
=
(
out_hb
%
out_height
)
*
2
-
padding_top
;
int
in_width0
=
(
out_w_blk
<<
1
)
-
padding_left
;
int
in_width1
=
(
(
out_w_blk
+
out_w_blks
)
<<
1
)
-
padding_left
;
int
in_width2
=
(
(
out_w_blk
+
(
out_w_blks
<<
1
))
<<
1
)
-
padding_left
;
int
in_width3
=
(
(
out_w_blk
+
(
out_w_blks
<<
1
)
+
out_w_blks
)
<<
1
)
-
padding_left
;
int
in_width4
=
(
(
out_w_blk
+
(
out_w_blks
<<
2
))
<<
1
)
-
padding_left
;
const
int
height_idx
=
(
(
out_hb
%
out_height
)
<<
1
)
-
padding_top
;
#
endif
const
int
batch_idx
=
(
out_hb
/
out_height
)
*
in_height
;
const
int
batch_idx
=
mul24
((
out_hb
/
out_height
)
,
in_height
)
;
const
int
rounded_in_ch_x_3
=
(
rounded_in_ch
<<
1
)
+
rounded_in_ch
;
DATA_TYPE4
in0,
in1,
in2,
in3,
in4
;
DATA_TYPE4
weights0,
weights1,
weights2,
weights3
;
int
in_idx,
hb_idx,
width_idx,
in_width_idx
;
//
Unrolling
this
loop
hurt
perfmance
int
hb_idx,
width_idx,
in_width_idx
;
for
(
short
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const
int
in_idx
=
mul24
(
in_ch_blk,
in_width
)
;
int
filter_x_part0
=
in_ch_blk
<<
2
;
for
(
short
hb_idx
=
0
; hb_idx < 3; ++hb_idx) {
int
in_hb_value
=
height_idx
+
hb_idx
;
in_hb_value
=
select
(
in_hb_value
+
batch_idx,
-1
,
(
in_hb_value
<
0
|
| in_hb_value >= in_height));
int filter_x_part1 = 0;
for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i + width_idx; \
...
...
@@ -81,40 +82,42 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#
undef
READ_INPUT
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
(
hb_idx
*
3
+
width_idx
)
*
rounded_in_ch
;
//
int
filter_idx
=
(
hb_idx
*
3
+
width_idx
)
*
rounded_in_ch
+
(
in_ch_blk
<<
2
)
;
int
filter_idx
=
filter_x_part0
+
filter_x_part1
;
weights0
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
SAMPLER,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
//
Interleaving
load
and
mul
does
not
improve
performance
as
expected
out0
+=
in0.x
*
weights0
;
out0
+=
in0.y
*
weights1
;
out0
+=
in0.z
*
weights2
;
out
0
+=
in0.w
*
weights3
;
out1
+=
in1.x
*
weights0
;
out1
+=
in1.y
*
weights1
;
out1
+=
in1.z
*
weights2
;
out
1
+=
in1.w
*
weights3
;
out2
+=
in2.x
*
weights0
;
out2
+=
in2.y
*
weights1
;
out2
+=
in2.z
*
weights2
;
out
2
+=
in2.w
*
weights3
;
out3
+=
in3.x
*
weights0
;
out3
+=
in3.y
*
weights1
;
out3
+=
in3.z
*
weights2
;
out
3
+=
in3.w
*
weights3
;
out4
+=
in4.x
*
weights0
;
out4
+=
in4.y
*
weights1
;
out4
+=
in4.z
*
weights2
;
out4
+=
in4.w
*
weights3
;
out0
=
mad
(
in0.x,
weights0,
out0
)
;
out0
=
mad
(
in0.y,
weights1,
out0
)
;
out0
=
mad
(
in0.z,
weights2,
out0
)
;
out0
=
mad
(
in0.w,
weights3,
out0
)
;
out
1
=
mad
(
in1.x,
weights0,
out1
)
;
out1
=
mad
(
in1.y,
weights1,
out1
)
;
out1
=
mad
(
in1.z,
weights2,
out1
)
;
out1
=
mad
(
in1.w,
weights3,
out1
)
;
out
2
=
mad
(
in2.x,
weights0,
out2
)
;
out2
=
mad
(
in2.y,
weights1,
out2
)
;
out2
=
mad
(
in2.z,
weights2,
out2
)
;
out2
=
mad
(
in2.w,
weights3,
out2
)
;
out
3
=
mad
(
in3.x,
weights0,
out3
)
;
out3
=
mad
(
in3.y,
weights1,
out3
)
;
out3
=
mad
(
in3.z,
weights2,
out3
)
;
out3
=
mad
(
in3.w,
weights3,
out3
)
;
out
4
=
mad
(
in4.x,
weights0,
out4
)
;
out4
=
mad
(
in4.y,
weights1,
out4
)
;
out4
=
mad
(
in4.z,
weights2,
out4
)
;
out4
=
mad
(
in4.w,
weights3,
out4
)
;
filter_x_part1
+=
rounded_in_ch
;
}
filter_x_part0
+=
rounded_in_ch_x_3
;
}
}
...
...
@@ -127,7 +130,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out4
=
fmax
(
out4,
0
)
;
#
endif
const
int
out_x_base
=
out_ch_blk
*
out_width
;
const
int
out_x_base
=
mul24
(
out_ch_blk,
out_width
)
;
int
w
=
out_w_blk
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
...
...
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
63e6e07f
...
...
@@ -15,7 +15,7 @@ inline int calculate_avg_block_size(const int pool_size,
const
int
w_start
=
max
(
0
,
pos_w
)
;
const
int
h_end
=
min
(
pos_h
+
pool_size,
h_size
)
;
const
int
w_end
=
min
(
pos_w
+
pool_size,
w_size
)
;
return
(
h_end
-
h_start
)
*
(
w_end
-
w_start
)
;
return
mul24
((
h_end
-
h_start
)
,
(
w_end
-
w_start
)
)
;
}
//
Supported
data
type:
half/float
...
...
@@ -33,10 +33,10 @@ __kernel void pooling(__read_only image2d_t input,
const
int
out_width
=
get_global_size
(
1
)
;
const
int
out_hb_idx
=
get_global_id
(
2
)
;
const
int
batch_idx
=
(
out_hb_idx
/
out_height
)
*
in_height
;
const
int
in_height_start
=
(
out_hb_idx
%
out_height
)
*
stride
-
pad_top
;
const
int
in_width_start
=
out_width_idx
*
stride
-
pad_left
;
const
int
in_channel_offset
=
out_chan_idx
*
in_width
;
const
int
batch_idx
=
mul24
((
out_hb_idx
/
out_height
)
,
in_height
)
;
const
int
in_height_start
=
mul24
((
out_hb_idx
%
out_height
)
,
stride
)
-
pad_top
;
const
int
in_width_start
=
mul24
(
out_width_idx,
stride
)
-
pad_left
;
const
int
in_channel_offset
=
mul24
(
out_chan_idx,
in_width
)
;
#
ifdef
POOL_AVG
...
...
@@ -83,5 +83,5 @@ __kernel void pooling(__read_only image2d_t input,
}
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_chan_idx
*
out_width
+
out_width_idx
,
out_hb_idx
)
,
res
)
;
WRITE_IMAGET
(
output,
(
int2
)(
mad24
(
out_chan_idx,
out_width,
out_width_idx
)
,
out_hb_idx
)
,
res
)
;
}
mace/kernels/opencl/cl/relu.cl
浏览文件 @
63e6e07f
...
...
@@ -8,7 +8,7 @@ __kernel void relu(__read_only image2d_t input,
const
int
hb
=
get_global_id
(
2
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
pos
=
ch_blk
*
width
+
w
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
out
=
fmax
(
in,
0
)
;
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
...
...
@@ -22,7 +22,7 @@ __kernel void relux(__read_only image2d_t input,
const
int
hb
=
get_global_id
(
2
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
pos
=
ch_blk
*
width
+
w
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
out
=
clamp
(
in,
0
,
max_limit
)
;
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
...
...
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
63e6e07f
...
...
@@ -25,8 +25,8 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
const
float
h_lerp
=
h_in
-
h_lower
;
const
float
w_lerp
=
w_in
-
w_lower
;
const
int
in_w_offset
=
ch_blk
*
in_width
;
const
int
in_h_offset
=
b
*
in_height
;
const
int
in_w_offset
=
mul24
(
ch_blk,
in_width
)
;
const
int
in_h_offset
=
mul24
(
b,
in_height
)
;
DATA_TYPE4
top_left
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_w_offset
+
w_lower,
in_h_offset
+
h_lower
))
;
...
...
@@ -37,13 +37,12 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
DATA_TYPE4
bottom_right
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_w_offset
+
w_upper,
in_h_offset
+
h_upper
))
;
DATA_TYPE4
top
=
top_left
+
(
top_right
-
top_left
)
*
w_lerp
;
DATA_TYPE4
bottom
=
bottom_left
+
(
bottom_right
-
bottom_left
)
*
w_lerp
;
DATA_TYPE4
top
=
mad
((
top_right
-
top_left
)
,
w_lerp,
top_left
)
;
DATA_TYPE4
bottom
=
mad
((
bottom_right
-
bottom_left
)
,
w_lerp,
bottom_left
)
;
DATA_TYPE4
out
=
mad
((
bottom
-
top
)
,
h_lerp,
top
)
;
DATA_TYPE4
out
=
top
+
(
bottom
-
top
)
*
h_lerp
;
const
int
out_w_offset
=
ch_blk
*
out_width
;
const
int
out_h_offset
=
b
*
out_height
;
const
int
out_w_offset
=
mul24
(
ch_blk,
out_width
)
;
const
int
out_h_offset
=
mul24
(
b,
out_height
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_w_offset
+
w,
out_h_offset
+
h
)
,
out
)
;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录