Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
e4ec380d
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
e4ec380d
编写于
11月 29, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Conv3x3 opencl : remove array.
上级
bc7bd0a2
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
118 additions
and
116 deletions
+118
-116
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+118
-116
未找到文件。
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
e4ec380d
...
...
@@ -3,7 +3,7 @@
__kernel
void
conv_2d_3x3
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
*
kw
*
kh,
cout/4
*/
#
ifdef
BIAS
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
#
endif
__write_only
image2d_t
output,
__private
const
int
in_height,
...
...
@@ -19,24 +19,27 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
DATA_TYPE4
out0
=
0
;
DATA_TYPE4
out1
=
0
;
DATA_TYPE4
out2
=
0
;
DATA_TYPE4
out3
=
0
;
DATA_TYPE4
out4
=
0
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
out[5]
=
{0}
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
#
ifdef
BIAS
out
[0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)
(
bias,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
out
[1]
=
out[0]
;
out
[2]
=
out[0]
;
out
[3]
=
out[0]
;
out
[4]
=
out[0]
;
out
0
=
READ_IMAGET
(
bias,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
out
1
=
out0
;
out
2
=
out0
;
out
3
=
out0
;
out
4
=
out0
;
#
endif
int
w[5]
;
w[0]
=
out_w_blk
-
padding_left
;
w[1]
=
w[0]
+
out_w_blks
;
w[2]
=
w[1]
+
out_w_blks
;
w[3]
=
w[2]
+
out_w_blks
;
w[4]
=
w[3]
+
out_w_blks
;
int
w0
=
out_w_blk
-
padding_left
;
int
w1
=
w0
+
out_w_blks
;
int
w2
=
w1
+
out_w_blks
;
int
w3
=
w2
+
out_w_blks
;
int
w4
=
w3
+
out_w_blks
;
const
int
batch_idx
=
out_hb
/
out_height
;
const
int
height_idx
=
out_hb
%
out_height
;
...
...
@@ -51,112 +54,111 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int input_image_width = in_ch_blks * in_width;
VEC_DATA_TYPE(DATA_TYPE, 4) in[5]
;
VEC_DATA_TYPE(DATA_TYPE, 4) weights[4]
;
DATA_TYPE4 in0, in1, in2, in3, in4
;
DATA_TYPE4 weights0, weights1, weights2, weights3
;
int in_idx, hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurts performance
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (int i = 0; i < 9; ++i) {
in_idx = in_ch_blk * in_width;
hb_idx = i / 3;
width_idx = i % 3;
in_width_idx = w[0] + width_idx;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[0] = 0;
} else {
in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[1] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[1] = 0;
} else {
in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[2] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[2] = 0;
} else {
in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for (short hb_idx = 0; hb_idx < 3; ++ hb_idx) {
for (short width_idx = 0; width_idx < 3; ++width_idx) {
in_idx = in_ch_blk * in_width;
in_width_idx = w0 + width_idx;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in0 = 0;
} else {
in0 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w1 + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in1 = 0;
} else {
in1 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w2 + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in2 = 0;
} else {
in2 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w3 + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in3 = 0;
} else {
in3 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w4 + width_idx;
if (in_width_idx < 0 |
|
in_width_idx
>=
in_width
)
{
in4
=
0
;
}
else
{
in4
=
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_idx
+
in_width_idx,
in_hb[hb_idx]
))
;
}
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
(
hb_idx
*
3
+
width_idx
)
*
rounded_in_ch
;
weights0
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
prefetch
image
data?
//
Interleaving
load
and
mul
does
not
improve
performance
as
expected
out0
+=
in0.x
*
weights0
;
out0
+=
in0.y
*
weights1
;
out0
+=
in0.z
*
weights2
;
out0
+=
in0.w
*
weights3
;
out1
+=
in1.x
*
weights0
;
out1
+=
in1.y
*
weights1
;
out1
+=
in1.z
*
weights2
;
out1
+=
in1.w
*
weights3
;
out2
+=
in2.x
*
weights0
;
out2
+=
in2.y
*
weights1
;
out2
+=
in2.z
*
weights2
;
out2
+=
in2.w
*
weights3
;
out3
+=
in3.x
*
weights0
;
out3
+=
in3.y
*
weights1
;
out3
+=
in3.z
*
weights2
;
out3
+=
in3.w
*
weights3
;
out4
+=
in4.x
*
weights0
;
out4
+=
in4.y
*
weights1
;
out4
+=
in4.z
*
weights2
;
out4
+=
in4.w
*
weights3
;
}
in_width_idx = w[3] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[3] = 0;
} else {
in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[4] + width_idx;
if (in_width_idx < 0 |
|
in_width_idx
>=
in_width
)
{
in[4]
=
0
;
}
else
{
in[4]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
(
int2
)(
in_idx
+
in_width_idx,
in_hb[hb_idx]
))
;
}
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
i
*
rounded_in_ch
;
weights[0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights[1]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights[2]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights[3]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
prefetch
image
data?
//
Interleaving
load
and
mul
does
not
improve
performance
as
expected
out[0]
+=
in[0].x
*
weights[0]
;
out[0]
+=
in[0].y
*
weights[1]
;
out[0]
+=
in[0].z
*
weights[2]
;
out[0]
+=
in[0].w
*
weights[3]
;
out[1]
+=
in[1].x
*
weights[0]
;
out[1]
+=
in[1].y
*
weights[1]
;
out[1]
+=
in[1].z
*
weights[2]
;
out[1]
+=
in[1].w
*
weights[3]
;
out[2]
+=
in[2].x
*
weights[0]
;
out[2]
+=
in[2].y
*
weights[1]
;
out[2]
+=
in[2].z
*
weights[2]
;
out[2]
+=
in[2].w
*
weights[3]
;
out[3]
+=
in[3].x
*
weights[0]
;
out[3]
+=
in[3].y
*
weights[1]
;
out[3]
+=
in[3].z
*
weights[2]
;
out[3]
+=
in[3].w
*
weights[3]
;
out[4]
+=
in[4].x
*
weights[0]
;
out[4]
+=
in[4].y
*
weights[1]
;
out[4]
+=
in[4].z
*
weights[2]
;
out[4]
+=
in[4].w
*
weights[3]
;
}
}
const
int
out_x_base
=
out_ch_blk
*
out_width
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)
(
output,
(
int2
)(
out_x_base
+
w[0]
+
padding_left,
out_hb
)
,
out[0]
)
;
w
[1]
+=
padding_left
;
if
(
w
[1]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)
(
output,
(
int2
)(
out_x_base
+
w[1]
,
out_hb
)
,
out[1]
)
;
w
[2]
+=
padding_left
;
if
(
w
[2]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)
(
output,
(
int2
)(
out_x_base
+
w[2]
,
out_hb
)
,
out[2]
)
;
w
[3]
+=
padding_left
;
if
(
w
[3]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)
(
output,
(
int2
)(
out_x_base
+
w[3]
,
out_hb
)
,
out[3]
)
;
w
[4]
+=
padding_left
;
if
(
w
[4]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)
(
output,
(
int2
)(
out_x_base
+
w[4]
,
out_hb
)
,
out[4]
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w0
+
padding_left,
out_hb
)
,
out0
)
;
w
1
+=
padding_left
;
if
(
w
1
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w1
,
out_hb
)
,
out1
)
;
w
2
+=
padding_left
;
if
(
w
2
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w2
,
out_hb
)
,
out2
)
;
w
3
+=
padding_left
;
if
(
w
3
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w3
,
out_hb
)
,
out3
)
;
w
4
+=
padding_left
;
if
(
w
4
>=
out_width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w4
,
out_hb
)
,
out4
)
;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录