Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
1374a365
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
1374a365
编写于
11月 29, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize opencl conv 3x3 kernel.
上级
fd284f6a
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
113 addition
and
84 deletion
+113
-84
mace/core/opencl_allocator.cc
mace/core/opencl_allocator.cc
+1
-0
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+84
-58
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+3
-3
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+5
-4
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+20
-19
未找到文件。
mace/core/opencl_allocator.cc
浏览文件 @
1374a365
...
...
@@ -13,6 +13,7 @@ namespace {
static
cl_channel_type
DataTypeToCLChannelType
(
const
DataType
t
)
{
switch
(
t
)
{
case
DT_HALF
:
return
CL_HALF_FLOAT
;
case
DT_FLOAT
:
return
CL_FLOAT
;
case
DT_INT8
:
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
1374a365
...
...
@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__write_only
image2d_t
output,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
in_ch
annel
s,
__private
const
int
in_ch
_blk
s,
__private
const
int
out_height,
__private
const
int
out_width,
__private
const
int
padding_top,
...
...
@@ -17,25 +17,26 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
in_ch_blks
=
(
in_channels
+
3
)
/
4
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
out[
4
]
=
{0}
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
out[
5
]
=
{0}
;
#
ifdef
BIAS
out[0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
bias,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
out[1]
=
out[0]
;
out[2]
=
out[0]
;
out[3]
=
out[0]
;
out[4]
=
out[0]
;
#
endif
int
w[
4
]
;
int
w[
5
]
;
w[0]
=
out_w_blk
-
padding_left
;
w[1]
=
w[0]
+
out_w_blks
;
w[2]
=
w[1]
+
out_w_blks
;
w[3]
=
w[2]
+
out_w_blks
;
w[4]
=
w[3]
+
out_w_blks
;
const
int
batch_idx
=
out_hb
/
out_height
;
const
int
height_idx
=
out_hb
%
out_height
;
...
...
@@ -50,64 +51,83 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int input_image_width = in_ch_blks * in_width;
VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
int in_idx, hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurt perfmance
int idx = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
for (int i = 0; i < 9; ++i) {
int filter_idx = in_ch_blk << 2;
int in_idx = in_ch_blk * in_width;
in_idx = in_ch_blk * in_width;
#pragma unroll
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
idx = i * 12 + j * 4;
int in_width_idx = w[0] + j;
hb_idx = i / 3;
width_idx = i % 3;
in_width_idx = w[0] + width_idx;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 0] = 0;
in[0] = 0;
} else {
in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[1] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[1] = 0;
} else {
in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i
]));
in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx
]));
}
in_width_idx = w[1] + j
;
in_width_idx = w[2] + width_idx
;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 1
] = 0;
in[2
] = 0;
} else {
in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i
]));
in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx
]));
}
in_width_idx = w[2] + j
;
in_width_idx = w[3] + width_idx
;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 2
] = 0;
in[3
] = 0;
} else {
in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i
]));
in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx
]));
}
in_width_idx = w[3] + j
;
in_width_idx = w[4] + width_idx
;
if (in_width_idx < 0 |
|
in_width_idx
>=
in_width
)
{
in[idx
+
3
]
=
0
;
in[4
]
=
0
;
}
else
{
in[idx
+
3]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
(
int2
)(
in_idx
+
in_width_idx,
in_hb[i
]
))
;
in[4]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
(
int2
)(
in_idx
+
in_width_idx,
in_hb[hb_idx
]
))
;
}
weights[idx
+
0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights[idx
+
1]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights[idx
+
2]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights[idx
+
3]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
filter_idx
+=
rounded_in_ch
;
}
}
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
i
*
rounded_in_ch
;
weights[0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights[1]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights[2]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights[3]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
//
Interleaving
load
and
mul
does
not
improve
performance
as
expected
#
pragma
unroll
for
(
int
c
=
0
; c < 4; ++c) {
for
(
int
i
=
0
; i < 9; ++i) {
out[c]
+=
in[c
+
i
*
4].x
*
weights[0
+
i
*
4]
;
out[c]
+=
in[c
+
i
*
4].y
*
weights[1
+
i
*
4]
;
out[c]
+=
in[c
+
i
*
4].z
*
weights[2
+
i
*
4]
;
out[c]
+=
in[c
+
i
*
4].w
*
weights[3
+
i
*
4]
;
}
out[0]
+=
in[0].x
*
weights[0]
;
out[0]
+=
in[0].y
*
weights[1]
;
out[0]
+=
in[0].z
*
weights[2]
;
out[0]
+=
in[0].w
*
weights[3]
;
out[1]
+=
in[1].x
*
weights[0]
;
out[1]
+=
in[1].y
*
weights[1]
;
out[1]
+=
in[1].z
*
weights[2]
;
out[1]
+=
in[1].w
*
weights[3]
;
out[2]
+=
in[2].x
*
weights[0]
;
out[2]
+=
in[2].y
*
weights[1]
;
out[2]
+=
in[2].z
*
weights[2]
;
out[2]
+=
in[2].w
*
weights[3]
;
out[3]
+=
in[3].x
*
weights[0]
;
out[3]
+=
in[3].y
*
weights[1]
;
out[3]
+=
in[3].z
*
weights[2]
;
out[3]
+=
in[3].w
*
weights[3]
;
out[4]
+=
in[4].x
*
weights[0]
;
out[4]
+=
in[4].y
*
weights[1]
;
out[4]
+=
in[4].z
*
weights[2]
;
out[4]
+=
in[4].w
*
weights[3]
;
}
}
...
...
@@ -133,4 +153,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
(
int2
)(
out_x_base
+
w[3],
out_hb
)
,
out[3]
)
;
w[4]
+=
padding_left
;
if
(
w[4]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
(
int2
)(
out_x_base
+
w[4],
out_hb
)
,
out[4]
)
;
}
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
1374a365
...
...
@@ -22,7 +22,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv
4
(
width
);
const
index_t
width_blocks
=
RoundUpDiv
<
index_t
,
5
>
(
width
);
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
...
...
@@ -44,7 +44,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
output
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
1
)));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
2
)));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
_channel_blocks
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
0
]
/
2
);
...
...
@@ -56,7 +56,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)),
cl
::
NDRange
(
4
,
15
,
8
),
cl
::
NDRange
(
16
,
16
,
4
),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
1374a365
...
...
@@ -27,10 +27,10 @@ static void Conv2d(int iters,
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
channels
,
output_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
...
...
@@ -88,6 +88,7 @@ static void Conv2d(int iters,
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet
...
...
@@ -99,7 +100,7 @@ BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
BM_CONV_2D
(
1
,
32
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
,
float
);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
VALID
,
32
,
float
);
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
SAME
,
32
,
float
);
// Test RGB <-> YUV
BM_CONV_2D
(
1
,
3
,
2160
,
1080
,
1
,
1
,
1
,
VALID
,
3
,
float
);
...
...
mace/ops/conv_2d_test.cc
浏览文件 @
1374a365
...
...
@@ -84,18 +84,18 @@ TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3SAME
<
DeviceType
::
NEON
>
();
}
template
<
DeviceType
D
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3VALID
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
...
...
@@ -130,23 +130,23 @@ void TestNHWCSimple3x3VALID() {
net
.
RunOp
(
D
);
}
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
auto
expected
=
CreateTensor
<
T
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
ExpectTensorNear
<
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
template
<
DeviceType
D
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3SAME
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
...
...
@@ -181,21 +181,21 @@ void TestNHWCSimple3x3SAME() {
net
.
RunOp
(
D
);
}
auto
expected
=
CreateTensor
<
float
>
(
auto
expected
=
CreateTensor
<
T
>
(
{
1
,
3
,
3
,
1
},
{
8.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
18.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
8.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUSimple
)
{
TestNHWCSimple3x3VALID
<
DeviceType
::
CPU
>
();
TestNHWCSimple3x3SAME
<
DeviceType
::
CPU
>
();
TestNHWCSimple3x3VALID
<
DeviceType
::
CPU
,
float
>
();
TestNHWCSimple3x3SAME
<
DeviceType
::
CPU
,
float
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLSimple
)
{
TestNHWCSimple3x3VALID
<
DeviceType
::
OPENCL
>
();
TestNHWCSimple3x3SAME
<
DeviceType
::
OPENCL
>
();
TestNHWCSimple3x3VALID
<
DeviceType
::
OPENCL
,
float
>
();
TestNHWCSimple3x3SAME
<
DeviceType
::
OPENCL
,
float
>
();
}
template
<
DeviceType
D
>
...
...
@@ -457,11 +457,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
srand
(
time
(
NULL
));
// generate random input
index_t
batch
=
3
+
rand
()
%
10
;
index_t
batch
=
1
;
index_t
height
=
shape
[
0
];
index_t
width
=
shape
[
1
];
index_t
input_channels
=
shape
[
2
]
+
rand
()
%
10
;
index_t
output_channels
=
shape
[
3
]
+
rand
()
%
10
;
index_t
input_channels
=
shape
[
2
];
index_t
output_channels
=
shape
[
3
];
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
...
@@ -509,6 +509,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
for
(
int
kernel_size
:
{
3
})
{
for
(
int
stride
:
{
1
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
VALID
);
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
SAME
);
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录