Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
92e3e526
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
92e3e526
编写于
11月 30, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add conv 1x1 benchmark code.
上级
e52c49b3
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
79 additions
and
28 deletions
+79
-28
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+45
-27
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+1
-1
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+33
-0
未找到文件。
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
92e3e526
...
@@ -25,15 +25,15 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -25,15 +25,15 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
#
ifdef
BIAS
#
ifdef
BIAS
DATA_TYPE4
out0
=
READ_IMAGET
(
bias,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
float4
out0
=
convert_float4
(
READ_IMAGET
(
bias,
sampler,
(
int2
)(
out_ch_blk,
0
)
))
;
DATA_TYPE
4
out1
=
out0
;
float
4
out1
=
out0
;
DATA_TYPE
4
out2
=
out0
;
float
4
out2
=
out0
;
DATA_TYPE
4
out3
=
out0
;
float
4
out3
=
out0
;
#
else
#
else
DATA_TYPE
4
out0
=
0
;
float
4
out0
=
0
;
DATA_TYPE
4
out1
=
0
;
float
4
out1
=
0
;
DATA_TYPE
4
out2
=
0
;
float
4
out2
=
0
;
DATA_TYPE
4
out3
=
0
;
float
4
out3
=
0
;
#
endif
#
endif
int4
w
;
int4
w
;
...
@@ -64,16 +64,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -64,16 +64,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int
in_x_base
=
0
;
int
in_x_base
=
0
;
for
(
int
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
for
(
int
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4
in0
=
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.x,
out_hb_idx
))
;
float4
in0
=
convert_float4
(
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.x,
out_hb_idx
)
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.y,
out_hb_idx
))
;
float4
in1
=
convert_float4
(
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.y,
out_hb_idx
)
))
;
DATA_TYPE4
in2
=
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.z,
out_hb_idx
))
;
float4
in2
=
convert_float4
(
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.z,
out_hb_idx
)
))
;
DATA_TYPE4
in3
=
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.w,
out_hb_idx
))
;
float4
in3
=
convert_float4
(
READ_IMAGET
(
input,
sampler,
(
int2
)(
in_x_base
+
w.w,
out_hb_idx
)
))
;
const
int
filter_x0
=
in_ch_blk
<<
2
;
const
int
filter_x0
=
in_ch_blk
<<
2
;
DATA_TYPE4
weights0
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0,
out_ch_blk
))
;
float4
weights0
=
convert_float4
(
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0,
out_ch_blk
)
))
;
DATA_TYPE4
weights1
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0
+
1
,
out_ch_blk
))
;
float4
weights1
=
convert_float4
(
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0
+
1
,
out_ch_blk
)
))
;
DATA_TYPE4
weights2
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0
+
2
,
out_ch_blk
))
;
float4
weights2
=
convert_float4
(
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0
+
2
,
out_ch_blk
)
))
;
DATA_TYPE4
weights3
=
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0
+
3
,
out_ch_blk
))
;
float4
weights3
=
convert_float4
(
READ_IMAGET
(
filter,
sampler,
(
int2
)(
filter_x0
+
3
,
out_ch_blk
)
))
;
//
Will
prefetch
L2
improve
performance?
How
to
prefetch
image
data?
//
Will
prefetch
L2
improve
performance?
How
to
prefetch
image
data?
out0
+=
in0.x
*
weights0
;
out0
+=
in0.x
*
weights0
;
...
@@ -101,18 +101,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -101,18 +101,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#
ifdef
FUSED_BATCH_NORM
#
ifdef
FUSED_BATCH_NORM
//
batch
norm
//
batch
norm
DATA_TYPE
4
bn_scale_value
=
float
4
bn_scale_value
=
READ_IMAGET
(
bn_scale,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
convert_float4
(
READ_IMAGET
(
bn_scale,
sampler,
(
int2
)(
out_ch_blk,
0
)
))
;
DATA_TYPE4
scale0
=
(
DATA_TYPE
4
)(
bn_scale_value.x
)
;
float4
scale0
=
(
float
4
)(
bn_scale_value.x
)
;
DATA_TYPE4
scale1
=
(
DATA_TYPE
4
)(
bn_scale_value.y
)
;
float4
scale1
=
(
float
4
)(
bn_scale_value.y
)
;
DATA_TYPE4
scale2
=
(
DATA_TYPE
4
)(
bn_scale_value.z
)
;
float4
scale2
=
(
float
4
)(
bn_scale_value.z
)
;
DATA_TYPE4
scale3
=
(
DATA_TYPE
4
)(
bn_scale_value.w
)
;
float4
scale3
=
(
float
4
)(
bn_scale_value.w
)
;
DATA_TYPE
4
bn_offset_value
=
float
4
bn_offset_value
=
READ_IMAGET
(
bn_offset,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
READ_IMAGET
(
bn_offset,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
DATA_TYPE4
offset0
=
(
DATA_TYPE
4
)(
bn_offset_value.x
)
;
float4
offset0
=
(
float
4
)(
bn_offset_value.x
)
;
DATA_TYPE4
offset1
=
(
DATA_TYPE
4
)(
bn_offset_value.y
)
;
float4
offset1
=
(
float
4
)(
bn_offset_value.y
)
;
DATA_TYPE4
offset2
=
(
DATA_TYPE
4
)(
bn_offset_value.z
)
;
float4
offset2
=
(
float
4
)(
bn_offset_value.z
)
;
DATA_TYPE4
offset3
=
(
DATA_TYPE
4
)(
bn_offset_value.w
)
;
float4
offset3
=
(
float
4
)(
bn_offset_value.w
)
;
out0
=
out0
*
scale0
+
offset0
;
out0
=
out0
*
scale0
+
offset0
;
out1
=
out1
*
scale1
+
offset1
;
out1
=
out1
*
scale1
+
offset1
;
...
@@ -128,6 +128,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -128,6 +128,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out3
=
fmax
(
out3,
0
)
;
out3
=
fmax
(
out3,
0
)
;
#
endif
#
endif
#
ifdef
TYPE_FLOAT
const
int
out_x_base
=
out_ch_blk
*
width
;
const
int
out_x_base
=
out_ch_blk
*
width
;
int
out_x_idx
=
out_w_blk
;
int
out_x_idx
=
out_w_blk
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out0
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out0
)
;
...
@@ -143,4 +144,21 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -143,4 +144,21 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
out_x_idx
+=
out_w_blks
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
if
(
out_x_idx
>=
width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out3
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out3
)
;
#
else
const
int
out_x_base
=
out_ch_blk
*
width
;
int
out_x_idx
=
out_w_blk
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
convert_half4
(
out0
))
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
convert_half4
(
out1
))
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
convert_half4
(
out2
))
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
convert_half4
(
out3
))
;
#
endif
}
}
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
92e3e526
...
@@ -33,7 +33,7 @@ void Conv1x1(const Tensor *input,
...
@@ -33,7 +33,7 @@ void Conv1x1(const Tensor *input,
MACE_CHECK
(
input_batch
==
batch
);
MACE_CHECK
(
input_batch
==
batch
);
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
())
);
built_options
.
emplace
(
input
->
dtype
()
==
DT_FLOAT
?
"-DTYPE_FLOAT"
:
""
);
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DataTypeToOPENCLCMDDataType
(
input
->
dtype
()));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DataTypeToOPENCLCMDDataType
(
input
->
dtype
()));
built_options
.
emplace
(
"-DSTRIDE="
+
ToString
(
stride
));
built_options
.
emplace
(
"-DSTRIDE="
+
ToString
(
stride
));
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
92e3e526
...
@@ -91,7 +91,40 @@ static void Conv2d(int iters,
...
@@ -91,7 +91,40 @@ static void Conv2d(int iters,
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet
BM_CONV_2D
(
1
,
512
,
15
,
15
,
1
,
1
,
1
,
VALID
,
1024
,
half
);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D
(
1
,
64
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
,
half
);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D
(
1
,
32
,
60
,
60
,
1
,
1
,
1
,
VALID
,
128
,
half
);
BM_CONV_2D
(
1
,
128
,
60
,
60
,
3
,
3
,
1
,
VALID
,
128
,
half
);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
SAME
,
32
,
half
);
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
SAME
,
32
,
half
);
// Test RGB <-> YUV
//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
//
//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
}
// namespace mace
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录