Project: 慢慢CG / Mace — forked from Xiaomi / Mace (in sync with the fork source)
Commit c9393858
Authored on Dec 01, 2017 by yejianwu

Merge branch 'master' of v9.git.n.xiaomi.com:deep-learning/mace into bm_to_image

Parents: 9109a5a0, 0d010ac4

Showing 18 changed files with 521 additions and 334 deletions (+521 −334)
Changed files:

mace/core/half.h (+1 −1)
mace/dsp/test/quantized_resize_bilinear_test.cc (+24 −19)
mace/dsp/test/supernode_test.cc (+1 −1)
mace/kernels/conv_2d.h (+29 −13)
mace/kernels/opencl/buffer_to_image.cc (+2 −2)
mace/kernels/opencl/cl/conv_2d_1x1.cl (+39 −18)
mace/kernels/opencl/cl/conv_2d_3x3.cl (+121 −129)
mace/kernels/opencl/conv_2d_opencl.cc (+13 −10)
mace/kernels/opencl/conv_2d_opencl_1x1.cc (+10 −6)
mace/kernels/opencl/conv_2d_opencl_3x3.cc (+11 −6)
mace/kernels/opencl/helper.cc (+2 −4)
mace/ops/buffer_to_image.cc (+1 −1)
mace/ops/buffer_to_image_test.cc (+6 −0)
mace/ops/conv_2d.cc (+10 −0)
mace/ops/conv_2d_benchmark.cc (+37 −36)
mace/ops/conv_2d_test.cc (+162 −69)
mace/ops/ops_test_util.h (+34 −19)
mace/proto/mace.proto (+18 −0)
mace/core/half.h
@@ -1098,7 +1098,7 @@ namespace half_float
     /// Conversion constructor.
     /// \param rhs float to convert
-    explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+    half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}

     /// Conversion to single-precision.
     /// \return single precision value representing expression value
mace/dsp/test/quantized_resize_bilinear_test.cc
@@ -5,6 +5,7 @@
 #include "mace/dsp/hexagon_control_wrapper.h"
 #include "gtest/gtest.h"

+#define RESIZE_BILINEAR_TEST_CHANNELS 128
 using namespace mace;

 static NetDef BuildNetDef() {
@@ -17,7 +18,7 @@ static NetDef BuildNetDef() {
   input_op->set_type("INPUT");
   input_op->set_node_id(0);
   input_op->set_padding(0);
-  input_op->add_out_max_byte_size(1000);
+  input_op->add_out_max_byte_size(1200);

   // relu op
   OperatorDef *resize_bilinear_op = net.add_op();
@@ -45,7 +46,7 @@ static NetDef BuildNetDef() {
   input_node_input = resize_bilinear_op->add_node_input();
   input_node_input->set_node_id(12);
   input_node_input->set_output_port(0);
-  resize_bilinear_op->add_out_max_byte_size(1000);
+  resize_bilinear_op->add_out_max_byte_size(1200);
   resize_bilinear_op->add_out_max_byte_size(1000);
   resize_bilinear_op->add_out_max_byte_size(1000);
@@ -64,8 +65,8 @@ static NetDef BuildNetDef() {
   new_dim_tensor->add_dims(2);
   new_dim_tensor->set_data_type(DataType::DT_INT32);
   new_dim_tensor->set_node_id(10);
-  new_dim_tensor->add_int32_data(1);
-  new_dim_tensor->add_int32_data(1);
+  new_dim_tensor->add_int32_data(2);
+  new_dim_tensor->add_int32_data(2);

   TensorProto *input_min_tensor = net.add_tensors();
   input_min_tensor->set_name("input_min");
@@ -86,20 +87,20 @@ static NetDef BuildNetDef() {
   input_info->set_name("input_node");
   input_info->set_node_id(0);
   input_info->add_dims(1);
-  input_info->add_dims(2);
-  input_info->add_dims(2);
-  input_info->add_dims(128);
+  input_info->add_dims(3);
+  input_info->add_dims(3);
+  input_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
   input_info->set_data_type(DataType::DT_UINT8);
-  input_info->set_max_byte_size(1000);
+  input_info->set_max_byte_size(1200);

   OutputInfo *output_info = net.add_output_info();
   output_info->set_name("output_node");
   output_info->set_node_id(1);
   output_info->add_dims(1);
-  output_info->add_dims(1);
-  output_info->add_dims(1);
-  output_info->add_dims(128);
+  output_info->add_dims(2);
+  output_info->add_dims(2);
+  output_info->add_dims(RESIZE_BILINEAR_TEST_CHANNELS);
   output_info->set_data_type(DataType::DT_UINT8);
-  output_info->set_max_byte_size(1000);
+  output_info->set_max_byte_size(1200);

   return net;
 }
@@ -117,21 +118,25 @@ TEST(QuantizedResizeBilinearTest, QuantizedResizeBilinear) {
   Allocator *cpu_allocator = GetDeviceAllocator(DeviceType::CPU);
   Tensor input_tensor(cpu_allocator, DT_UINT8);
   Tensor output_tensor(cpu_allocator, DT_UINT8);
-  input_tensor.Resize({1, 2, 2, 128});
-  output_tensor.Resize({1, 1, 1, 128});
+  input_tensor.Resize({1, 3, 3, RESIZE_BILINEAR_TEST_CHANNELS});
+  output_tensor.Resize({1, 2, 2, RESIZE_BILINEAR_TEST_CHANNELS});
   uint8_t *input_data = input_tensor.mutable_data<uint8_t>();
   const uint8_t *output_data = output_tensor.data<uint8_t>();
-  for (int c = 0; c < 128; ++c) {
-    input_data[c] = input_data[c + 128] = input_data[c + 256] =
-        input_data[c + 384] = (uint8_t)c;
+  for (int wh = 0; wh < 9; ++wh) {
+    for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c) {
+      input_data[wh * RESIZE_BILINEAR_TEST_CHANNELS + c] = 9 - wh;
+    }
   }

   VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   wrapper.PrintLog();

-  for (int i = 0; i < output_tensor.size(); ++i) {
-    EXPECT_EQ(i, output_data[i]);
+  vector<uint8_t> expected {9, 8, 5, 3};
+  for (int i = 0; i < 4; ++i) {
+    for (int c = 0; c < RESIZE_BILINEAR_TEST_CHANNELS; ++c)
+      EXPECT_EQ(expected[i], output_data[i * RESIZE_BILINEAR_TEST_CHANNELS + c]);
   }

   std::cout << std::endl;
mace/dsp/test/supernode_test.cc
@@ -224,7 +224,7 @@ TEST(SupernodeTest, Supernode) {
       input_data[h * 4 + w] = (uint8_t)((h == 0) ? 0 : h * 64 - 1);
     }

-  VLOG(0) << wrapper.ExecuteGraphNew(&input_tensor, 1, &output_tensor, 1);
+  VLOG(0) << wrapper.ExecuteGraph(input_tensor, &output_tensor);
   wrapper.PrintLog();

   // expect out: [[49.2095, 49.2095], [50.7905, 50.7905]]
mace/kernels/conv_2d.h
@@ -11,13 +11,23 @@
 namespace mace {
 namespace kernels {

+struct Conv2dFunctorBase {
+  Conv2dFunctorBase(const int *strides,
+                    const Padding &paddings,
+                    const int *dilations)
+      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const int *dilations_;  // [dilation_h, dilation_w]
+  Padding paddings_;
+};
+
 template <DeviceType D, typename T>
-struct Conv2dFunctor {
-  Conv2dFunctor() {}
+struct Conv2dFunctor : Conv2dFunctorBase {
   Conv2dFunctor(const int *strides,
                 const Padding &paddings,
                 const int *dilations)
-      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+      : Conv2dFunctorBase(strides, paddings, dilations) {}

   void operator()(const Tensor *input,
                   const Tensor *filter,
@@ -76,9 +86,10 @@ struct Conv2dFunctor {
     for (int h = 0; h < height; ++h) {
       for (int w = 0; w < width; ++w) {
         for (int c = 0; c < channels; ++c) {
-          T bias_channel = bias_data ? bias_data[c] : 0;
+          T bias_channel = 0.0f;
+          if (bias) bias_channel = bias_data[c];
           *output_data = bias_channel;
-          T sum = 0;
+          T sum = 0.0f;
           const T *filter_ptr = filter_data + c;
           for (int kh = 0; kh < kernel_h; ++kh) {
             for (int kw = 0; kw < kernel_w; ++kw) {
@@ -113,9 +124,6 @@ struct Conv2dFunctor {
   }

-  const int *strides_;    // [stride_h, stride_w]
-  const int *dilations_;  // [dilation_h, dilation_w]
-  Padding paddings_;
 };

 template <>
@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                         const Tensor *filter,
                                                         const Tensor *bias,
                                                         Tensor *output);

-template <>
-void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+template <typename T>
+struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
+  Conv2dFunctor(const int *strides,
+                const Padding &paddings,
+                const int *dilations)
+      : Conv2dFunctorBase(strides, paddings, dilations) {}
+
+  void operator()(const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
                   Tensor *output);
+};

 }  // namespace kernels
 }  // namespace mace
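The refactoring above hoists the shared stride/padding/dilation members into Conv2dFunctorBase, so the primary Conv2dFunctor template and the new OPENCL partial specialization no longer duplicate them. A minimal standalone sketch of the same pattern follows; everything here other than the member names taken from the diff (a stand-in Padding enum, the integer Device parameter, the empty operator()) is illustrative, not MACE's actual code.

#include <cassert>

enum Padding { VALID, SAME };

// Shared state for every device-specific convolution functor.
struct Conv2dFunctorBase {
  Conv2dFunctorBase(const int *strides, const Padding &paddings,
                    const int *dilations)
      : strides_(strides), dilations_(dilations), paddings_(paddings) {}

  const int *strides_;    // [stride_h, stride_w]
  const int *dilations_;  // [dilation_h, dilation_w]
  Padding paddings_;
};

// A device/type-specific functor only forwards its arguments to the base;
// operator() is where the actual convolution body would live.
template <int Device, typename T>
struct Conv2dFunctor : Conv2dFunctorBase {
  Conv2dFunctor(const int *strides, const Padding &paddings,
                const int *dilations)
      : Conv2dFunctorBase(strides, paddings, dilations) {}

  void operator()() const {
    assert(strides_ != nullptr && dilations_ != nullptr);
  }
};

int main() {
  const int strides[2] = {1, 1};
  const int dilations[2] = {1, 1};
  Conv2dFunctor<0, float> conv(strides, VALID, dilations);
  conv();  // a specialization would run its kernel here
}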
mace/kernels/opencl/buffer_to_image.cc
@@ -24,8 +24,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   }
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum<T>::value));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum<T>::value));
   auto runtime = OpenCLRuntime::Get();
   string kernel_name;
   switch (type) {
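The change above keys the OpenCL program's -D defines off the functor's compile-time element type T rather than the runtime image dtype, which is how the same .cl source gets specialized for float or half. A hedged sketch of that idea in plain C++ — the CLTypeName mapping and half_t type below are stand-ins, not the MACE helpers:

#include <iostream>
#include <set>
#include <string>

// Illustrative stand-in for a DataType-to-CL-type mapping: it produces the
// value injected into the kernel as -DDATA_TYPE=<name>.
template <typename T> std::string CLTypeName();
template <> std::string CLTypeName<float>() { return "float"; }
struct half_t {};  // placeholder for a half-precision element type
template <> std::string CLTypeName<half_t>() { return "half"; }

template <typename T>
std::set<std::string> BuildOptions() {
  std::set<std::string> built_options;
  // The resulting strings would be passed to the OpenCL program build step.
  built_options.emplace("-DDATA_TYPE=" + CLTypeName<T>());
  return built_options;
}

int main() {
  for (const auto &opt : BuildOptions<float>()) std::cout << opt << "\n";
  for (const auto &opt : BuildOptions<half_t>()) std::cout << opt << "\n";
}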
mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t bn_offset, /* cout%4 * cout/4 */
 #endif
                           __write_only image2d_t output,
+                          __private const int in_height,
+                          __private const int in_width,
                           __private const int in_ch_blks,
                           __private const int height,
                           __private const int width) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
@@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #endif

   int4 w;
 #if STRIDE == 1
   w.x = out_w_blk;
   w.y = w.x + out_w_blks;
   w.z = w.y + out_w_blks;
   w.w = w.z + out_w_blks;
+  int out_hb_idx = (out_hb % height);
 #else
   w.x = out_w_blk * 2;
   w.y = (out_w_blk + out_w_blks) * 2;
   w.z = (out_w_blk + 2 * out_w_blks) * 2;
   w.w = (out_w_blk + 3 * out_w_blks) * 2;
+  int out_hb_idx = (out_hb % height) * 2;
 #endif

+  w.x = select(w.x, INT_MIN, w.x >= in_width);
+  w.y = select(w.y, INT_MIN, w.y >= in_width);
+  w.z = select(w.z, INT_MIN, w.z >= in_width);
+  w.w = select(w.w, INT_MIN, w.w >= in_width);
+
+  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
+                      -1,
+                      out_hb_idx >= in_height);
+
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
-    DATA_TYPE4 in1 = 0;
-    DATA_TYPE4 in2 = 0;
-    DATA_TYPE4 in3 = 0;
-    if (w.y < width) {
-      // conditional load hurt perf, this branching helps sometimes
-      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
-      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
-      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
-    }
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
+    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
+    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
+    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));

     const int filter_x0 = in_ch_blk << 2;
     DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
@@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
     out3 += in3.z * weights2;
     out3 += in3.w * weights3;

-    in_x_base += width;
+    in_x_base += in_width;
   }

 #ifdef FUSED_BATCH_NORM
@@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #endif

   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
+  int out_x_idx = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

-  if (w.y >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);

-  if (w.z >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);

-  if (w.w >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
 }
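The 1x1 kernel above has each work-item produce four output columns spaced out_w_blks apart, and bails out of the trailing writes once the running column index reaches the output width. The same blocking scheme in plain, scalar C++ — a sketch only, with the write standing in for WRITE_IMAGET and the "work-item" reduced to a function call:

#include <cstdio>
#include <vector>

// Each "work-item" owns columns {w, w + blocks, w + 2*blocks, w + 3*blocks};
// columns past `width` are skipped, mirroring the early returns after each
// guarded write in the kernel.
void WriteFourColumns(std::vector<int> &row, int out_w_blk, int out_w_blks) {
  const int width = static_cast<int>(row.size());
  int out_x_idx = out_w_blk;
  for (int i = 0; i < 4; ++i) {
    if (out_x_idx >= width) return;   // guard, like `if (out_x_idx >= width) return;`
    row[out_x_idx] = out_w_blk;       // stand-in for WRITE_IMAGET(...)
    out_x_idx += out_w_blks;
  }
}

int main() {
  std::vector<int> row(10, -1);
  const int out_w_blks = (10 + 3) / 4;  // ceil(width / 4), i.e. RoundUpDiv4(width)
  for (int blk = 0; blk < out_w_blks; ++blk) WriteFourColumns(row, blk, out_w_blks);
  for (int v : row) std::printf("%d ", v);  // every column written exactly once
  std::printf("\n");
}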
mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const int rounded_in_ch = in_ch_blks * 4;

   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-  VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
 #ifdef BIAS
-  out[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
-  out[1] = out[0];
-  out[2] = out[0];
-  out[3] = out[0];
-  out[4] = out[0];
+  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+  DATA_TYPE4 out4 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
+  DATA_TYPE4 out4 = 0;
 #endif

+#if STRIDE == 1
+  int in_width0 = out_w_blk - padding_left;
+  int in_width1 = in_width0 + out_w_blks;
+  int in_width2 = in_width1 + out_w_blks;
+  int in_width3 = in_width2 + out_w_blks;
+  int in_width4 = in_width3 + out_w_blks;
+  const int height_idx = (out_hb % out_height) - padding_top;
+#else
+  int in_width0 = out_w_blk * 2 - padding_left;
+  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
+  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
+  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
+  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
+  const int height_idx = (out_hb % out_height) * 2 - padding_top;
+#endif

-  int w[5];
-  w[0] = out_w_blk - padding_left;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
-  w[4] = w[3] + out_w_blks;
-
-  const int batch_idx = out_hb / out_height;
-  const int height_idx = out_hb % out_height;
-  int in_hb[3];
-  in_hb[0] = height_idx - padding_top;
-  in_hb[1] = in_hb[0] + 1;
-  in_hb[2] = in_hb[1] + 1;
-  // Judge the height border for padding input.
-  in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height;
-  in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height;
-  in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height;
-
-  const int input_image_width = in_ch_blks * in_width;
-
-  VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
-  VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
+  const int batch_idx = (out_hb / out_height) * in_height;
+
+  DATA_TYPE4 in0, in1, in2, in3, in4;
+  DATA_TYPE4 weights0, weights1, weights2, weights3;
   int in_idx, hb_idx, width_idx, in_width_idx;

   // Unrolling this loop hurt perfmance
-  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    for (int i = 0; i < 9; ++i) {
+  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
+      for (short width_idx = 0; width_idx < 3; ++width_idx) {
       in_idx = in_ch_blk * in_width;
-      hb_idx = i / 3;
-      width_idx = i % 3;
-      in_width_idx = w[0] + width_idx;
-      // Judge the width border for padding input.
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[0] = 0;
-      } else {
-        in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[1] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[1] = 0;
-      } else {
-        in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[2] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[2] = 0;
-      } else {
-        in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[3] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[3] = 0;
-      } else {
-        in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[4] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[4] = 0;
-      } else {
-        in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
-      weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-      weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-      weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-      weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+        int in_hb_value = height_idx + hb_idx;
+        in_hb_value = select(in_hb_value + batch_idx, -1,
+                             (in_hb_value < 0 || in_hb_value >= in_height));
+        int in_width_value;
+#define READ_INPUT(i)                                                        \
+        in_width_value = in_width##i + width_idx;                            \
+        in_width_value = select(in_idx + in_width_value,                     \
+                                -1,                                          \
+                                (in_width_value < 0 || in_width_value >= in_width)); \
+        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+        READ_INPUT(0);
+        READ_INPUT(1);
+        READ_INPUT(2);
+        READ_INPUT(3);
+        READ_INPUT(4);
+#undef READ_INPUT
+        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
+        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));

       // Will prefetch L2 improve performance? How to pretch image data?
       // Interleaving load and mul does not improve performance as expected
-      out[0] += in[0].x * weights[0];
-      out[0] += in[0].y * weights[1];
-      out[0] += in[0].z * weights[2];
-      out[0] += in[0].w * weights[3];
-
-      out[1] += in[1].x * weights[0];
-      out[1] += in[1].y * weights[1];
-      out[1] += in[1].z * weights[2];
-      out[1] += in[1].w * weights[3];
-
-      out[2] += in[2].x * weights[0];
-      out[2] += in[2].y * weights[1];
-      out[2] += in[2].z * weights[2];
-      out[2] += in[2].w * weights[3];
-
-      out[3] += in[3].x * weights[0];
-      out[3] += in[3].y * weights[1];
-      out[3] += in[3].z * weights[2];
-      out[3] += in[3].w * weights[3];
-
-      out[4] += in[4].x * weights[0];
-      out[4] += in[4].y * weights[1];
-      out[4] += in[4].z * weights[2];
-      out[4] += in[4].w * weights[3];
+        out0 += in0.x * weights0;
+        out0 += in0.y * weights1;
+        out0 += in0.z * weights2;
+        out0 += in0.w * weights3;
+
+        out1 += in1.x * weights0;
+        out1 += in1.y * weights1;
+        out1 += in1.z * weights2;
+        out1 += in1.w * weights3;
+
+        out2 += in2.x * weights0;
+        out2 += in2.y * weights1;
+        out2 += in2.z * weights2;
+        out2 += in2.w * weights3;
+
+        out3 += in3.x * weights0;
+        out3 += in3.y * weights1;
+        out3 += in3.z * weights2;
+        out3 += in3.w * weights3;
+
+        out4 += in4.x * weights0;
+        out4 += in4.y * weights1;
+        out4 += in4.z * weights2;
+        out4 += in4.w * weights3;
+      }
     }
   }

   const int out_x_base = out_ch_blk * out_width;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(out_x_base + w[0] + padding_left, out_hb), out[0]);
-
-  w[1] += padding_left;
-  if (w[1] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(out_x_base + w[1], out_hb), out[1]);
-
-  w[2] += padding_left;
-  if (w[2] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(out_x_base + w[2], out_hb), out[2]);
-
-  w[3] += padding_left;
-  if (w[3] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(out_x_base + w[3], out_hb), out[3]);
-
-  w[4] += padding_left;
-  if (w[4] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output, (int2)(out_x_base + w[4], out_hb), out[4]);
+  int w = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out4);
 }
mace/kernels/opencl/conv_2d_opencl.cc
@@ -10,33 +10,33 @@ namespace kernels {
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

 extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

-template <>
-void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
+template <typename T>
+void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                        const Tensor *filter,
                                                        const Tensor *bias,
                                                        Tensor *output) {
   typedef void (*Conv2dOpenclFunction)(
       const Tensor *input, const Tensor *filter, const Tensor *bias,
-      const int *padding, Tensor *output);
+      const int *padding, DataType dt, Tensor *output);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5][2] = {
       {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
       {nullptr, nullptr},
-      {Conv2dOpenclK3x3S1, nullptr},
+      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
       {nullptr, nullptr},
       {nullptr, nullptr}};
@@ -50,7 +50,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
         << " stride " << strides_[0] << "x" << strides_[1]
         << " is not implemented yet, using slow version";
     // TODO(heliangliang) The CPU/NEON kernel should map the buffer
-    Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
+    Conv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
         input, filter, bias, output);
     return;
   }
@@ -70,8 +70,11 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
   }
   auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_func(input, filter, bias, paddings.data(), output);
+  conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum<T>::value, output);
 }

+template struct Conv2dFunctor<DeviceType::OPENCL, float>;
+template struct Conv2dFunctor<DeviceType::OPENCL, half>;
+
 }  // namespace kernels
 }  // namespace mace
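The dispatch above picks a specialized kernel from a static kernel_size × stride table of function pointers, with nullptr entries falling through to the slow generic path. A compact, self-contained sketch of that selection logic — the conv functions here are placeholders, not MACE's Conv2dOpenclK* kernels:

#include <cstdio>

// Placeholder specialized kernels (MACE's would take tensors and a DataType).
void ConvK1x1S1() { std::puts("1x1, stride 1"); }
void ConvK1x1S2() { std::puts("1x1, stride 2"); }
void ConvK3x3S1() { std::puts("3x3, stride 1"); }
void ConvK3x3S2() { std::puts("3x3, stride 2"); }
void SlowFallback(int k, int s) { std::printf("fallback for %dx%d stride %d\n", k, k, s); }

using ConvFn = void (*)();

// Row index = kernel_size - 1, column index = stride - 1.
static const ConvFn selector[5][2] = {
    {ConvK1x1S1, ConvK1x1S2},
    {nullptr, nullptr},
    {ConvK3x3S1, ConvK3x3S2},
    {nullptr, nullptr},
    {nullptr, nullptr}};

void RunConv(int kernel, int stride) {
  ConvFn fn = nullptr;
  if (kernel >= 1 && kernel <= 5 && (stride == 1 || stride == 2))
    fn = selector[kernel - 1][stride - 1];
  if (fn == nullptr) {  // unsupported shape: use the slow generic path
    SlowFallback(kernel, stride);
    return;
  }
  fn();
}

int main() {
  RunConv(1, 2);
  RunConv(3, 1);
  RunConv(5, 1);  // no 5x5 entry yet -> fallback
}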
mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
              const Tensor *filter,
              const Tensor *bias,
              const int stride,
+             const DataType dt,
              Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);

-  MACE_CHECK(stride == 1);
+  MACE_CHECK(input_batch == batch);
+  MACE_CHECK(stride != 1 || (input_height == height && input_width == width));

   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
+  built_options.emplace("-DSTRIDE=" + ToString(stride));
   if (bias != nullptr) {
     built_options.emplace("-DBIAS");
@@ -54,7 +53,10 @@ void Conv1x1(const Tensor *input,
     conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
   }
   conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
   conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
   auto command_queue = runtime->command_queue();
@@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
                                const int *padding,
+                               const DataType dt,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 1, output);
+  Conv1x1(input, filter, bias, 1, dt, output);
 };

 extern void Conv2dOpenclK1x1S2(const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
                                const int *padding,
+                               const DataType dt,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 2, output);
+  Conv1x1(input, filter, bias, 2, dt, output);
 };

 }  // namespace kernels
mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -13,7 +13,8 @@ namespace kernels {
 static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                          const Tensor *bias, const uint32_t stride,
-                         const int *padding, Tensor *output) {
+                         const int *padding, const DataType dt,
+                         Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);
@@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));

   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
@@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
 }

 void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
-                        const Tensor *bias, const int *padding,
-                        Tensor *output) {
-  Conv2d3x3S12(input, filter, bias, 1, padding, output);
+                        const Tensor *bias, const int *padding,
+                        const DataType dt, Tensor *output) {
+  Conv2d3x3S12(input, filter, bias, 1, padding, dt, output);
 };

 void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
-                        const Tensor *bias, const int *padding,
-                        Tensor *output) {
+                        const Tensor *bias, const int *padding,
+                        const DataType dt, Tensor *output) {
+  Conv2d3x3S12(input, filter, bias, 2, padding, dt, output);
 };

 }  // namespace kernels
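Both OpenCL paths size their work grid with rounded-up divisions: RoundUpDiv4 for the 1x1 kernel (four output columns per work-item) and RoundUpDiv<index_t, 5> for the 3x3 kernel (five columns per work-item). A small sketch of that arithmetic, assuming these helpers are plain ceiling divisions — the definitions below are illustrative, not copied from MACE's utils:

#include <cassert>
#include <cstdint>

using index_t = int64_t;

// Assumed semantics: ceil(value / N) for positive values.
template <typename T, T N>
constexpr T RoundUpDiv(T value) { return (value + N - 1) / N; }

template <typename T>
constexpr T RoundUpDiv4(T value) { return RoundUpDiv<T, 4>(value); }

int main() {
  // 1x1 kernel: a width of 33 needs 9 work-items of 4 columns each.
  assert((RoundUpDiv4<index_t>(33) == 9));
  // 3x3 kernel: the same width needs 7 work-items of 5 columns each.
  assert((RoundUpDiv<index_t, 5>(33) == 7));
}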
mace/kernels/opencl/helper.cc
@@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
 std::string DataTypeToCLType(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
       return "float";
     case DT_HALF:
-      return "half";
+      return "float";
     case DT_UINT8:
       return "uchar";
     case DT_INT8:
@@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) {
 std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
       return "f";
     case DT_HALF:
-      return "h";
+      return "f";
     default:
       LOG(FATAL) << "Not supported data type for opencl cmd data type";
       return "";
mace/ops/buffer_to_image.cc
@@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
 REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
                              .TypeConstraint<half>("T")
                              .Build(),
-                         BufferToImageOp<DeviceType::OPENCL, float>);
+                         BufferToImageOp<DeviceType::OPENCL, half>);

 }  // namespace mace
mace/ops/buffer_to_image_test.cc
@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
       .Input("Input")
       .Output("B2IOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef());

   // Add input data
@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
       .Input("B2IOutput")
       .Output("I2BOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
      .Finalize(net.NewOperatorDef());

   // Run
@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
 }

+TEST(BufferToImageTest, ArgHalfSmall) {
+  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {1});
+}
+
 TEST(BufferToImageTest, ArgMedia) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
 }
mace/ops/conv_2d.cc
@@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
                           .Build(),
                       Conv2dOp<DeviceType::CPU, float>);

+REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      Conv2dOp<DeviceType::CPU, half>);
+
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
                            .TypeConstraint<float>("T")
@@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
                              .Build(),
                          Conv2dOp<DeviceType::OPENCL, float>);

+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         Conv2dOp<DeviceType::OPENCL, half>);
+
 }  // namespace mace
mace/ops/conv_2d_benchmark.cc
@@ -27,15 +27,15 @@ static void Conv2d(int iters,
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
-  net.AddRandomInput<D, T>("Filter",
-                           {kernel_h, kernel_w, channels, output_channels});
-  net.AddRandomInput<D, T>("Bias", {output_channels});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  net.AddRandomInput<D, float>("Filter",
+                               {kernel_h, kernel_w, channels, output_channels});
+  net.AddRandomInput<D, float>("Bias", {output_channels});

   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);

     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -44,6 +44,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -54,6 +55,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   }
@@ -88,43 +90,42 @@ static void Conv2d(int iters,
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

 // ICNet
 BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
 BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
 BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
-// Test RGB <-> YUV
-BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);
-// Test bad alignments
-BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
-BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
-BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
-BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
-BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
-BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
-BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
-BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
-BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
-BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
+// BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+// BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+//
+// BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
+// BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
+// BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
+// BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
+// BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
+// BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
+// BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
+// BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
+// BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
+// BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
+// BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
+// BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+// BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+// BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
+// BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+// BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+// BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
+// BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
+// BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
+// BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
+// BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);

 }  // namespace mace
mace/ops/conv_2d_test.cc
@@ -98,9 +98,9 @@ void TestNHWCSimple3x3VALID() {
   net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});

   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);

     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());

     net.RunOp(D);

     // Transfer output
-    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);

   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
   }

-  auto expected = CreateTensor<T>({1, 1, 1, 1}, {18.1f});
-  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
+  auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
+  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }

 template <DeviceType D, typename T>
@@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() {
   net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});

   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);

     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")
@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::SAME)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);

     // Transfer output
-    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);

   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
@@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::SAME)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
   }

-  auto expected = CreateTensor<T>(
+  auto expected = CreateTensor<float>(
       {1, 3, 3, 1},
       {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});

-  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }

 TEST_F(Conv2dOpTest, CPUSimple) {
@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
   TestSimple3x3WithoutBias<DeviceType::NEON>();
 }

-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNHWCSimple3x3WithoutBias() {
   OpsTestNet net;

   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Filter", {3, 3, 2, 1},
      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});

   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);

     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);

     // Transfer output
-    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("Input")
@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
   // Check
   auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.0f});

-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }

 TEST_F(Conv2dOpTest, CPUWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::CPU>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::CPU, float>();
 }

 TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
-  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL>();
+  TestNHWCSimple3x3WithoutBias<DeviceType::OPENCL, float>();
 }

 template <DeviceType D>
@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
   TestCombined3x3<DeviceType::NEON>();
 }

-template <DeviceType D>
+template <DeviceType D, typename T>
 static void TestNHWCCombined3x3() {
   // Construct graph
   OpsTestNet net;

   // Add input data
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Input", {1, 5, 5, 2},
      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<D, float>(
+  net.AddInputFromArray<D, T>(
       "Filter", {3, 3, 2, 2},
      {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
       1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f,
       1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f});
-  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
+  net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});

   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);

     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
         .AddIntsArg("strides", {2, 2})
         .AddIntArg("padding", Padding::SAME)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);

-    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("Input")
@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
         .AddIntsArg("strides", {2, 2})
         .AddIntArg("padding", Padding::SAME)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
@@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() {
       {1, 3, 3, 2},
      {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, 9.2f,
       12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});

-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }

-TEST_F(Conv2dOpTest, CPUCombined) {
-  TestNHWCCombined3x3<DeviceType::CPU>();
+TEST_F(Conv2dOpTest, CPUStride2) {
+  TestNHWCCombined3x3<DeviceType::CPU, float>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLStride2) {
+  TestNHWCCombined3x3<DeviceType::OPENCL, float>();
 }

 template <DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;

   // Add input data
@@ -415,12 +428,12 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});

   // Construct graph
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
-    OpDefBuilder("Conv2D", "Conv2dTest")
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("InputImage")
         .Input("FilterImage")
         .Input("BiasImage")
@@ -429,12 +442,10 @@ void TestConv1x1() {
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);

     // Transfer output
-    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
-
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("Input")
@@ -445,7 +456,7 @@ void TestConv1x1() {
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
   }
@@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) {
   TestConv1x1<DeviceType::OPENCL>();
 }

-template <DeviceType D>
+template <DeviceType D, typename T>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
@@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     srand(time(NULL));

     // generate random input
-    // TODO test all sizes
     index_t batch = 3 + (rand() % 10);
     index_t height = shape[0];
     index_t width = shape[1];
@@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());

     // Add input data
-    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, float>(
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>(
         "Filter", {kernel_h, kernel_w, input_channels, output_channels});
-    net.AddRandomInput<D, float>("Bias", {output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});

     // run on cpu
     net.RunOp();
@@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     expected.Copy(*net.GetOutput("Output"));

     // run on gpu
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);

     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
@@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run on device
     net.RunOp(D);

-    ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };

   for (int kernel_size : {1, 3}) {
-    for (int stride : {1}) {
+    for (int stride : {1, 2}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
     }
@@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
 }

 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128});
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 32, 64});
 }

 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
 }
+
+template <DeviceType D>
+static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    srand(time(NULL));
+
+    // generate random input
+    index_t batch = 3 + (rand() % 10);
+    index_t height = shape[0];
+    index_t width = shape[1];
+    index_t input_channels = shape[2] + (rand() % 10);
+    index_t output_channels = shape[3] + (rand() % 10);
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    std::vector<float> float_input_data;
+    GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data);
+    std::vector<float> float_filter_data;
+    GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
+    std::vector<float> float_bias_data;
+    GenerateRandomRealTypeData({output_channels}, float_bias_data);
+    // Add input data
+    net.AddInputFromArray<D, float>(
+        "Input", {batch, height, width, input_channels}, float_input_data);
+    net.AddInputFromArray<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data);
+    net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
+
+    // run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // run on gpu
+    BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
+  };
+
+  for (int kernel_size : {1, 3}) {
+    for (int stride : {1, 2}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
+    }
+  }
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
+}
mace/ops/ops_test_util.h
@@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
                                 std::vector<T> &res) {
   std::random_device rd;
   std::mt19937 gen(rd());
-  std::normal_distribution<T> nd(0, 1);
+  std::normal_distribution<float> nd(0, 1);

   index_t size = std::accumulate(shape.begin(), shape.end(), 1,
                                  std::multiplies<index_t>());
   res.resize(size);
-  std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
+  if (DataTypeToEnum<T>::value == DT_HALF) {
+    std::generate(res.begin(), res.end(),
+                  [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
+  } else {
+    std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
+  }
 }

 template <typename T>
@@ -290,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
   EXPECT_DOUBLE_EQ(a, b);
 }

-inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) {
-  ASSERT_EQ(x.dtype(), y.dtype());
+inline void AssertSameDims(const Tensor &x, const Tensor &y) {
   ASSERT_TRUE(IsSameSize(x, y))
       << "x.shape [" << ShapeToString(x) << "] vs "
      << "y.shape [ " << ShapeToString(y) << "]";
 }

-template <typename T, bool is_fp = is_floating_point_type<T>::value>
+template <typename EXP_TYPE, typename RES_TYPE,
+          bool is_fp = is_floating_point_type<EXP_TYPE>::value>
 struct Expector;

 // Partial specialization for float and double.
-template <typename T>
-struct Expector<T, true> {
-  static void Equal(const T &a, const T &b) { ExpectEqual(a, b); }
+template <typename EXP_TYPE, typename RES_TYPE>
+struct Expector<EXP_TYPE, RES_TYPE, true> {
+  static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }

   static void Equal(const Tensor &x, const Tensor &y) {
-    ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
-    AssertSameTypeDims(x, y);
+    ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
+    ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
+    AssertSameDims(x, y);
     Tensor::MappingGuard x_mapper(&x);
     Tensor::MappingGuard y_mapper(&y);
-    auto a = x.data<T>();
-    auto b = y.data<T>();
+    auto a = x.data<EXP_TYPE>();
+    auto b = y.data<RES_TYPE>();
     for (int i = 0; i < x.size(); ++i) {
       ExpectEqual(a(i), b(i));
     }
   }

   static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
-    ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
-    AssertSameTypeDims(x, y);
+    ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
+    ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
+    AssertSameDims(x, y);
     Tensor::MappingGuard x_mapper(&x);
     Tensor::MappingGuard y_mapper(&y);
-    auto a = x.data<T>();
-    auto b = y.data<T>();
+    auto a = x.data<EXP_TYPE>();
+    auto b = y.data<RES_TYPE>();
     for (int i = 0; i < x.size(); ++i) {
       EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
                                        << " index = " << i;
@@ -335,10 +340,18 @@ template <typename T>
 void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   static_assert(is_floating_point_type<T>::value, "T is not a floating point type");
-  Expector<T>::Near(x, y, abs_err);
+  Expector<T, T>::Near(x, y, abs_err);
+}
+
+template <typename EXP_TYPE, typename RES_TYPE>
+void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
+  static_assert(is_floating_point_type<EXP_TYPE>::value &&
+                    is_floating_point_type<RES_TYPE>::value,
+                "T is not a floating point type");
+  Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
 }

-template <DeviceType D>
+template <DeviceType D, typename T>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
                    const std::string &output_name,
@@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
       .Input(input_name)
       .Output(output_name)
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
       .Finalize(net.NewOperatorDef());

   // Run
@@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
   net.Sync();
 }

-template <DeviceType D>
+template <DeviceType D, typename T>
 void ImageToBuffer(OpsTestNet &net,
                    const std::string &input_name,
                    const std::string &output_name,
@@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
       .Input(input_name)
      .Output(output_name)
      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

   // Run
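The widened Expector<EXP_TYPE, RES_TYPE> lets a float reference tensor be checked against a half result tensor, which is also why the half conv test above relaxes the tolerance to 0.2 while float-vs-float stays at 0.001–0.01. A small sketch of the precision gap behind that choice, using the bundled half_float library that mace/core/half.h wraps (it only compiles inside the MACE tree; the specific numbers are illustrative):

#include <cmath>
#include <cstdio>
#include "mace/core/half.h"  // bundled half_float library used by the tests

int main() {
  // Round-tripping a float through half storage keeps roughly 3 decimal
  // digits, so an accumulated convolution result can drift by far more than
  // the 0.001 tolerance used for float-vs-float comparisons.
  const float reference = 18.1f;
  const half_float::half stored =
      half_float::half_cast<half_float::half>(reference);
  const float recovered = static_cast<float>(stored);
  std::printf("reference=%f recovered=%f abs_err=%g\n",
              reference, recovered, std::fabs(reference - recovered));
  // EXPECT_NEAR(reference, recovered, 0.01) would pass for a single value;
  // whole-network half results in the tests above use 0.2 instead.
}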
mace/proto/mace.proto
@@ -67,12 +67,20 @@ message NodeInput {
   optional int32 output_port = 2;
 }

+message OutputShape {
+  repeated int64 dims = 1;
+}
+
 message OperatorDef {
   repeated string input = 1;
   repeated string output = 2;
   optional string name = 3;
   optional string type = 4;
   repeated Argument arg = 5;
+  optional OutputShape output_shape = 6;
+
+  // Memory optimization: only support one single output op
+  optional int32 mem_id = 10 [default = -1];

   // for hexagon mace-nnlib
   optional uint32 node_id = 100;
@@ -82,6 +90,16 @@ message OperatorDef {
   repeated int32 out_max_byte_size = 104;  // only support 32-bit len
 }

+// for memory optimization
+message MemoryBlock {
+  optional int32 mem_id = 1;
+  optional uint32 x = 2;
+  optional uint32 y = 3;
+}
+message MemoryArena {
+  repeated MemoryBlock mem_block = 1;
+}
+
 // for hexagon mace-nnlib
 message InputInfo {
   optional string name = 1;
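The new MemoryBlock/MemoryArena messages describe preallocated 2D regions that an operator's mem_id field can refer to. A hedged sketch of populating them from C++ through the standard protobuf-generated accessors (repeated fields get add_*, optional fields get set_*); the header name follows protoc's naming for mace/proto/mace.proto, the mace:: namespace assumes the proto declares package mace, and the 64x64 sizes are made up:

#include "mace/proto/mace.pb.h"  // generated from mace/proto/mace.proto

// Record that the output tagged mem_id 0 should live in a 64x64 block.
mace::MemoryArena MakeArena() {
  mace::MemoryArena arena;
  mace::MemoryBlock *block = arena.add_mem_block();  // repeated MemoryBlock mem_block = 1
  block->set_mem_id(0);
  block->set_x(64);  // block width (illustrative value)
  block->set_y(64);  // block height (illustrative value)
  return arena;
}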