Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
116d33db
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
116d33db
编写于
12月 07, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add auto-tuning for opencl kernel.
上级
a66e0e35
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
261 addition
and
133 deletion
+261
-133
mace/core/net.cc
mace/core/net.cc
+5
-0
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+27
-21
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+1
-1
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+4
-6
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+3
-6
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+6
-7
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+9
-11
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+6
-7
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+4
-5
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+4
-4
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+46
-11
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+45
-9
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+45
-10
mace/kernels/opencl/relu_opencl.cc
mace/kernels/opencl/relu_opencl.cc
+45
-21
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+2
-0
mace/tools/benchmark/benchmark_model.cc
mace/tools/benchmark/benchmark_model.cc
+3
-3
mace/utils/tuner.h
mace/utils/tuner.h
+1
-0
tools/validate.py
tools/validate.py
+3
-8
tools/validate_gcn.sh
tools/validate_gcn.sh
+2
-3
未找到文件。
mace/core/net.cc
浏览文件 @
116d33db
...
...
@@ -87,6 +87,11 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG
(
1
)
<<
"Op "
<<
op
->
debug_def
().
name
()
<<
" has shape: "
<<
internal
::
MakeString
(
op
->
Output
(
0
)
->
shape
());
}
#ifdef __USE_OPENCL
if
(
device_type_
==
DeviceType
::
OPENCL
)
{
OpenCLRuntime
::
Get
()
->
command_queue
().
finish
();
}
#endif
return
true
;
}
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
116d33db
...
...
@@ -11,7 +11,7 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
void
BatchNormFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
scale
,
...
...
@@ -27,10 +27,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Get
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
...
...
@@ -38,9 +34,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
auto
bm_kernel
=
runtime
->
BuildKernel
(
"batch_norm"
,
"batch_norm"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
bm_kernel
);
const
std
::
vector
<
uint32_t
>
lws
=
{
1
,
kwg_size
,
1
};
uint32_t
idx
=
0
;
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input
->
buffer
())));
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
scale
->
buffer
())));
...
...
@@ -50,18 +43,31 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
bm_kernel
.
setArg
(
idx
++
,
epsilon_
);
bm_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
buffer
())));
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
bm_kernel
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
8
,
128
,
1
},
//SNPE size
{
1
,
1
,
64
},
{
1
,
1
,
128
},
{
1
,
kwg_size
/
16
,
16
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
},
{
1
,
1
,
kwg_size
},
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
},
{
7
,
15
,
9
},
{
9
,
7
,
15
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
}};
};
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
bm_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
...
...
@@ -73,10 +79,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
};
std
::
stringstream
ss
;
ss
<<
"batch_norm_opencl_kernel_"
<<
in
put
->
dim
(
0
)
<<
"_"
<<
in
put
->
dim
(
1
)
<<
"_"
<<
in
put
->
dim
(
2
)
<<
"_"
<<
in
put
->
dim
(
3
);
<<
out
put
->
dim
(
0
)
<<
"_"
<<
out
put
->
dim
(
1
)
<<
"_"
<<
out
put
->
dim
(
2
)
<<
"_"
<<
out
put
->
dim
(
3
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
ss
.
str
(),
lws
,
params_generator
,
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
116d33db
...
...
@@ -36,7 +36,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
auto
bias_kernel
=
runtime
->
BuildKernel
(
"bias_add"
,
"bias_add"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
bias_kernel
);
const
std
::
vector
<
uint32_t
>
lws
=
{
1
,
kwg_size
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
uint32_t
idx
=
0
;
bias_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input
->
buffer
())));
...
...
mace/kernels/opencl/cl/addn.cl
浏览文件 @
116d33db
...
...
@@ -12,19 +12,17 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
const
int
w
=
get_global_id
(
0
)
;
const
int
hb
=
get_global_id
(
1
)
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
sampler,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
sampler,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
out
=
in0
+
in1
;
#
if
INPUT_NUM
>
2
DATA_TYPE4
in2
=
READ_IMAGET
(
input2,
sampler
,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in2
=
READ_IMAGET
(
input2,
SAMPLER
,
(
int2
)(
w,
hb
))
;
out
=
out
+
in2
;
#
endif
#
if
INPUT_NUM
>
3
DATA_TYPE4
in3
=
READ_IMAGET
(
input3,
sampler
,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in3
=
READ_IMAGET
(
input3,
SAMPLER
,
(
int2
)(
w,
hb
))
;
out
=
out
+
in3
;
#
endif
...
...
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
116d33db
...
...
@@ -54,9 +54,8 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
+
out_channel_idx
;
if
(
in_channel_idx
<
in_channel
)
{
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler
,
coord
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
SAMPLER
,
coord
)
;
const
int
size
=
(
out_channel
-
out_channel_idx
)
;
if
(
size
<
4
)
{
switch
(
size
)
{
...
...
@@ -119,9 +118,8 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const
int
offset
=
((
batch_idx
*
height
+
height_idx
)
*
width
+
width_idx
)
*
channels
+
channel_idx
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler
,
coord
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
SAMPLER
,
coord
)
;
const
int
size
=
channels
-
channel_idx
;
if
(
size
<
4
)
{
switch
(
size
)
{
...
...
@@ -169,9 +167,8 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
int
h
=
get_global_id
(
1
)
;
const
int
offset
=
w
*
4
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler
,
coord
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
SAMPLER
,
coord
)
;
const
int
size
=
count
-
offset
;
if
(
size
<
4
)
{
switch
(
size
)
{
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
116d33db
...
...
@@ -21,10 +21,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
#
ifdef
BIAS
DATA_TYPE4
out0
=
READ_IMAGET
(
bias,
sampler
,
(
int2
)(
out_ch_blk,
0
))
;
READ_IMAGET
(
bias,
SAMPLER
,
(
int2
)(
out_ch_blk,
0
))
;
DATA_TYPE4
out1
=
out0
;
DATA_TYPE4
out2
=
out0
;
DATA_TYPE4
out3
=
out0
;
...
...
@@ -71,7 +70,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 |
|
in_width_value
>=
in_width
))
; \
in##i
=
READ_IMAGET
(
input,
sampler
,
(
int2
)(
in_width_value,
in_hb_value
))
;
in##i
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_width_value,
in_hb_value
))
;
READ_INPUT
(
0
)
;
READ_INPUT
(
1
)
;
...
...
@@ -81,10 +80,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
#
undef
READ_INPUT
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
(
hb_idx
*
filter_width
+
width_idx
)
*
rounded_in_ch
;
weights0
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
weights0
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
...
...
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
116d33db
...
...
@@ -16,10 +16,8 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
#
ifdef
BIAS
DATA_TYPE4
out0
=
READ_IMAGET
(
bias,
sampler
,
(
int2
)(
out_ch_blk,
0
))
;
DATA_TYPE4
out0
=
READ_IMAGET
(
bias,
SAMPLER
,
(
int2
)(
out_ch_blk,
0
))
;
DATA_TYPE4
out1
=
out0
;
DATA_TYPE4
out2
=
out0
;
DATA_TYPE4
out3
=
out0
;
...
...
@@ -58,16 +56,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int
in_x_base
=
0
;
for
(
int
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4
in0
=
READ_IMAGET
(
input,
sampler
,
(
int2
)(
in_x_base
+
w.x,
out_hb_idx
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input,
sampler
,
(
int2
)(
in_x_base
+
w.y,
out_hb_idx
))
;
DATA_TYPE4
in2
=
READ_IMAGET
(
input,
sampler
,
(
int2
)(
in_x_base
+
w.z,
out_hb_idx
))
;
DATA_TYPE4
in3
=
READ_IMAGET
(
input,
sampler
,
(
int2
)(
in_x_base
+
w.w,
out_hb_idx
))
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_x_base
+
w.x,
out_hb_idx
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_x_base
+
w.y,
out_hb_idx
))
;
DATA_TYPE4
in2
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_x_base
+
w.z,
out_hb_idx
))
;
DATA_TYPE4
in3
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_x_base
+
w.w,
out_hb_idx
))
;
const
int
filter_x0
=
in_ch_blk
<<
2
;
DATA_TYPE4
weights0
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_x0,
out_ch_blk
))
;
DATA_TYPE4
weights1
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_x0
+
1
,
out_ch_blk
))
;
DATA_TYPE4
weights2
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_x0
+
2
,
out_ch_blk
))
;
DATA_TYPE4
weights3
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_x0
+
3
,
out_ch_blk
))
;
DATA_TYPE4
weights0
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_x0,
out_ch_blk
))
;
DATA_TYPE4
weights1
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_x0
+
1
,
out_ch_blk
))
;
DATA_TYPE4
weights2
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_x0
+
2
,
out_ch_blk
))
;
DATA_TYPE4
weights3
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_x0
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
out0
+=
in0.x
*
weights0
;
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
116d33db
...
...
@@ -19,10 +19,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
#
ifdef
BIAS
DATA_TYPE4
out0
=
READ_IMAGET
(
bias,
sampler
,
(
int2
)(
out_ch_blk,
0
))
;
READ_IMAGET
(
bias,
SAMPLER
,
(
int2
)(
out_ch_blk,
0
))
;
DATA_TYPE4
out1
=
out0
;
DATA_TYPE4
out2
=
out0
;
DATA_TYPE4
out3
=
out0
;
...
...
@@ -72,7 +71,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 |
|
in_width_value
>=
in_width
))
; \
in##i
=
READ_IMAGET
(
input,
sampler
,
(
int2
)(
in_width_value,
in_hb_value
))
;
in##i
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_width_value,
in_hb_value
))
;
READ_INPUT
(
0
)
;
READ_INPUT
(
1
)
;
...
...
@@ -83,10 +82,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#
undef
READ_INPUT
int
filter_idx
=
(
in_ch_blk
<<
2
)
+
(
hb_idx
*
3
+
width_idx
)
*
rounded_in_ch
;
weights0
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
sampler
,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
weights0
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights1
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights2
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights3
=
READ_IMAGET
(
filter,
SAMPLER
,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
...
...
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
116d33db
...
...
@@ -25,17 +25,16 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
const
float
h_lerp
=
h_in
-
h_lower
;
const
float
w_lerp
=
w_in
-
w_lower
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
const
int
in_w_offset
=
ch_blk
*
in_width
;
const
int
in_h_offset
=
b
*
in_height
;
DATA_TYPE4
top_left
=
READ_IMAGET
(
input,
sampler
,
DATA_TYPE4
top_left
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_w_offset
+
w_lower,
in_h_offset
+
h_lower
))
;
DATA_TYPE4
top_right
=
READ_IMAGET
(
input,
sampler
,
DATA_TYPE4
top_right
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_w_offset
+
w_upper,
in_h_offset
+
h_lower
))
;
DATA_TYPE4
bottom_left
=
READ_IMAGET
(
input,
sampler
,
DATA_TYPE4
bottom_left
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_w_offset
+
w_lower,
in_h_offset
+
h_upper
))
;
DATA_TYPE4
bottom_right
=
READ_IMAGET
(
input,
sampler
,
DATA_TYPE4
bottom_right
=
READ_IMAGET
(
input,
SAMPLER
,
(
int2
)(
in_w_offset
+
w_upper,
in_h_offset
+
h_upper
))
;
DATA_TYPE4
top
=
top_left
+
(
top_right
-
top_left
)
*
w_lerp
;
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
116d33db
...
...
@@ -43,10 +43,10 @@ static void Concat2(const Tensor *input0,
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
concat_kernel
);
uint32_t
lws
[
3
];
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
channel_blk
,
kwg_size
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
width
,
kwg_size
/
lws
[
0
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
height
*
batch
,
kwg_size
/
(
lws
[
0
]
*
lws
[
1
]));
uint32_t
lws
[
3
]
=
{
8
,
16
,
8
}
;
//
lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
//
lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
//
lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
concat_kernel
,
cl
::
NullRange
,
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
116d33db
...
...
@@ -7,6 +7,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -48,7 +49,6 @@ void Conv1x1(const Tensor *input,
auto
program
=
runtime
->
program
();
auto
conv_2d_kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
"conv_2d_1x1"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
uint32_t
idx
=
0
;
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input
->
buffer
())));
...
...
@@ -63,16 +63,51 @@ void Conv1x1(const Tensor *input,
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
;
error
=
command_queue
.
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)),
cl
::
NDRange
(
4
,
15
,
8
),
// TODO auto tuning
nullptr
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
15
,
8
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
4
,
15
,
8
},
//SNPE size
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
},
{
7
,
15
,
9
},
{
9
,
7
,
15
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
}};
};
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>&
params
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
return
error
;
};
std
::
stringstream
ss
;
ss
<<
"conv2d_1x1_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
ss
.
str
(),
lws
,
params_generator
,
func
);
}
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
116d33db
...
...
@@ -7,6 +7,7 @@
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -54,15 +55,50 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
0
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
1
]
/
2
);
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
;
error
=
command_queue
.
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)),
cl
::
NDRange
(
16
,
16
,
4
),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
15
,
8
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
4
,
15
,
8
},
//SNPE size
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
},
{
7
,
15
,
9
},
{
9
,
7
,
15
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
}};
};
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
return
error
;
};
std
::
stringstream
ss
;
ss
<<
"conv2d_3x3_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
ss
.
str
(),
lws
,
params_generator
,
func
);
}
void
Conv2dOpenclK3x3S1
(
const
Tensor
*
input
,
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
116d33db
...
...
@@ -7,6 +7,7 @@
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -38,7 +39,6 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
auto
program
=
runtime
->
program
();
auto
conv_2d_kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
"conv_2d"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
uint32_t
idx
=
0
;
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input
->
buffer
())));
...
...
@@ -57,15 +57,50 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
0
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
1
]
/
2
);
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
;
error
=
command_queue
.
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)),
cl
::
NDRange
(
16
,
16
,
4
),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
4
,
15
,
8
},
//SNPE size
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
},
{
7
,
15
,
9
},
{
9
,
7
,
15
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
}};
};
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
return
error
;
};
std
::
stringstream
ss
;
ss
<<
"conv2d_general_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
ss
.
str
(),
lws
,
params_generator
,
func
);
}
...
...
mace/kernels/opencl/relu_opencl.cc
浏览文件 @
116d33db
...
...
@@ -7,6 +7,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -22,10 +23,6 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
...
...
@@ -33,38 +30,65 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
cl
::
Kernel
relu_kernel
;
if
(
max_limit_
<
0
)
{
auto
relu_kernel
=
runtime
->
BuildKernel
(
"relu"
,
"relu"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
relu_kernel
);
const
uint32_t
lws
[
3
]
=
{
1
,
kwg_size
,
1
};
relu_kernel
=
runtime
->
BuildKernel
(
"relu"
,
"relu"
,
built_options
);
uint32_t
idx
=
0
;
relu_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
relu_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
relu_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
else
{
auto
relu_kernel
=
runtime
->
BuildKernel
(
"relu"
,
"relux"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
relu_kernel
);
const
uint32_t
lws
[
3
]
=
{
1
,
kwg_size
,
1
};
relu_kernel
=
runtime
->
BuildKernel
(
"relu"
,
"relux"
,
built_options
);
uint32_t
idx
=
0
;
relu_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
relu_kernel
.
setArg
(
idx
++
,
max_limit_
);
relu_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
relu_kernel
);
auto
params_generator
=
[
&
kwg_size
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
return
{{
4
,
15
,
8
},
//SNPE size
{
kwg_size
/
16
,
4
,
4
},
{
kwg_size
/
32
,
4
,
8
},
{
kwg_size
/
32
,
8
,
4
},
{
kwg_size
/
64
,
8
,
8
},
{
kwg_size
/
64
,
16
,
4
},
{
kwg_size
/
128
,
8
,
16
},
{
kwg_size
/
128
,
16
,
8
},
{
kwg_size
/
128
,
32
,
4
},
{
1
,
kwg_size
/
32
,
32
},
{
1
,
kwg_size
/
64
,
64
},
{
1
,
kwg_size
/
128
,
128
},
{
3
,
15
,
9
},
{
7
,
15
,
9
},
{
9
,
7
,
15
},
{
15
,
7
,
9
},
{
1
,
kwg_size
,
1
}};
};
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
)
->
cl_int
{
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
relu_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lw
s
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
param
s
[
2
]),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
return
error
;
};
std
::
stringstream
ss
;
ss
<<
"relu_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
Tuner
<
uint32_t
>::
Get
()
->
template
TuneOrRun
<
cl_int
>(
ss
.
str
(),
lws
,
params_generator
,
func
);
}
template
...
...
mace/ops/conv_2d_benchmark.cc
浏览文件 @
116d33db
...
...
@@ -102,6 +102,8 @@ BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D
(
1
,
128
,
60
,
60
,
3
,
3
,
1
,
VALID
,
128
,
half
);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D
(
1
,
32
,
60
,
60
,
3
,
3
,
1
,
SAME
,
32
,
half
);
BM_CONV_2D
(
1
,
3
,
512
,
512
,
7
,
7
,
2
,
SAME
,
64
,
half
);
BM_CONV_2D
(
1
,
512
,
64
,
64
,
1
,
1
,
1
,
SAME
,
256
,
half
);
// Test RGB <-> YUV
//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
...
...
mace/tools/benchmark/benchmark_model.cc
浏览文件 @
116d33db
...
...
@@ -260,8 +260,6 @@ int Main(int argc, char **argv) {
DeviceType_Parse
(
device
,
&
device_type
);
VLOG
(
0
)
<<
device_type
;
if
(
device_type
==
DeviceType
::
OPENCL
)
OpenCLRuntime
::
EnableProfiling
();
// load model
std
::
ifstream
model_file_stream
(
model_file
,
std
::
ios
::
in
|
std
::
ios
::
binary
);
...
...
@@ -296,9 +294,11 @@ int Main(int argc, char **argv) {
}
}
}
auto
net
=
CreateNet
(
net_def
,
&
ws
,
device_type
,
NetMode
::
INIT
);
net
->
Run
();
// create net
auto
net
=
CreateNet
(
net_def
,
&
ws
,
device_type
);
net
=
CreateNet
(
net_def
,
&
ws
,
device_type
);
int64_t
warmup_time_us
=
0
;
int64_t
num_warmup_runs
=
0
;
...
...
mace/utils/tuner.h
浏览文件 @
116d33db
...
...
@@ -42,6 +42,7 @@ class Tuner {
}
else
{
// run
if
(
param_table_
.
find
(
param_key
)
!=
param_table_
.
end
())
{
VLOG
(
1
)
<<
param_key
<<
": "
<<
internal
::
MakeString
(
param_table_
[
param_key
]);
return
func
(
param_table_
[
param_key
]);
}
else
{
return
func
(
default_param
);
...
...
tools/validate.py
浏览文件 @
116d33db
...
...
@@ -18,10 +18,9 @@ from tensorflow import gfile
# --input_file input_file \
# --mace_out_file icnet.out
def
generate_data
(
shape
):
np
.
random
.
seed
(
FLAGS
.
random_seed
)
data
=
np
.
random
.
random
(
shape
)
data
=
np
.
random
.
random
(
shape
)
*
-
1
print
FLAGS
.
input_file
data
.
astype
(
np
.
float32
).
tofile
(
FLAGS
.
input_file
)
print
"Generate input file done."
...
...
@@ -36,12 +35,8 @@ def valid_output(out_shape, mace_out_file, tf_out_value):
mace_out_value
=
load_data
(
mace_out_file
)
if
mace_out_value
.
size
!=
0
:
mace_out_value
=
mace_out_value
.
reshape
(
out_shape
)
np
.
testing
.
assert_allclose
(
tf_out_value
,
mace_out_value
,
rtol
=
0
,
atol
=
1e-3
)
res
=
np
.
allclose
(
tf_out_value
,
mace_out_value
,
rtol
=
0
,
atol
=
1e-3
)
if
res
:
print
'=======================Passed! Haha======================'
else
:
print
'=======================Failed! Oops======================'
np
.
testing
.
assert_allclose
(
mace_out_value
,
tf_out_value
,
rtol
=
0.05
)
print
'=======================Passed! Haha======================'
else
:
print
'=======================Skip empty node==================='
...
...
tools/validate_gcn.sh
浏览文件 @
116d33db
...
...
@@ -32,7 +32,7 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
--output
=
${
MODEL_DIR
}
/
${
MACE_MODEL_NAME
}
\
--input_node
=
input
\
--output_node
=
GCN/br_result_2/fcn_br
\
--data_type
=
DT_
FLOAT
\
--data_type
=
DT_
HALF
\
--runtime
=
gpu
...
...
@@ -50,7 +50,7 @@ adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
adb push
${
MODEL_DIR
}
/
${
INPUT_FILE_NAME
}
${
PHONE_DATA_DIR
}
adb push bazel-bin/mace/examples/mace_run
${
PHONE_DATA_DIR
}
num_threads
=
${
1
:-
1
}
num_threads
=
${
1
:-
4
}
adb </dev/null shell
MACE_RUN_PARAMETER_PATH
=
${
PHONE_DATA_DIR
}
/mace_run.config
\
MACE_KERNEL_PATH
=
$KERNEL_DIR
\
...
...
@@ -77,4 +77,3 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
--input_node
input
\
--output_node
GCN/br_result_2/fcn_br
\
--output_shape
1,512,512,2
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录