Commit 8db742cf

Authored December 01, 2017 by Liangliang He

Merge branch 'opencl-conv3x3' into 'master'

Optimize OpenCL conv3x3 and fix buffer to image test bug. See merge request !129

Parents: 903f399d, 9a2f3c9f
Changes: 15 changed files with 478 additions and 314 deletions (+478, −314)
Changed files:

  mace/core/half.h                            +1    −1
  mace/kernels/conv_2d.h                      +29   −13
  mace/kernels/opencl/buffer_to_image.cc      +2    −2
  mace/kernels/opencl/cl/conv_2d_1x1.cl       +39   −18
  mace/kernels/opencl/cl/conv_2d_3x3.cl       +121  −129
  mace/kernels/opencl/conv_2d_opencl.cc       +13   −10
  mace/kernels/opencl/conv_2d_opencl_1x1.cc   +10   −6
  mace/kernels/opencl/conv_2d_opencl_3x3.cc   +11   −6
  mace/kernels/opencl/helper.cc               +2    −4
  mace/ops/buffer_to_image.cc                 +1    −1
  mace/ops/buffer_to_image_test.cc            +6    −0
  mace/ops/conv_2d.cc                         +10   −0
  mace/ops/conv_2d_benchmark.cc               +37   −36
  mace/ops/conv_2d_test.cc                    +162  −69
  mace/ops/ops_test_util.h                    +34   −19
mace/core/half.h  (+1 −1)

@@ -1098,7 +1098,7 @@ namespace half_float
   /// Conversion constructor.
   /// \param rhs float to convert
-  explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
+  half(float rhs) : data_(detail::float2half<round_style>(rhs)) {}
   /// Conversion to single-precision.
   /// \return single precision value representing expression value
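A note on why the `explicit` keyword is dropped: the templated CPU functor in this same commit writes `T bias_channel = 0.0f;` and the new half-typed operators construct half values directly from float expressions, both of which need an implicit float-to-half conversion. A minimal sketch with stand-in types (not the real half_float implementation):

    // Sketch only: why an explicit constructor breaks templated code that
    // copy-initializes T from a float.
    #include <iostream>

    struct HalfExplicit {
      explicit HalfExplicit(float v) : value(v) {}  // requires a cast at every call site
      float value;
    };

    struct HalfImplicit {
      HalfImplicit(float v) : value(v) {}           // allows float -> half conversion
      float value;
    };

    template <typename T>
    T ZeroBias(float bias) {
      return bias;  // compiles only if float converts implicitly to T
    }

    int main() {
      // ZeroBias<HalfExplicit>(0.1f);                       // would not compile
      HalfImplicit h = ZeroBias<HalfImplicit>(0.1f);         // fine once explicit is removed
      std::cout << h.value << std::endl;
      return 0;
    }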
mace/kernels/conv_2d.h  (+29 −13)

@@ -11,13 +11,23 @@
 namespace mace {
 namespace kernels {

+struct Conv2dFunctorBase {
+  Conv2dFunctorBase(const int *strides,
+                    const Padding &paddings,
+                    const int *dilations)
+      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const int *dilations_;  // [dilation_h, dilation_w]
+  Padding paddings_;
+};
+
 template <DeviceType D, typename T>
-struct Conv2dFunctor {
-  Conv2dFunctor() {}
+struct Conv2dFunctor : Conv2dFunctorBase {
   Conv2dFunctor(const int *strides,
                 const Padding &paddings,
                 const int *dilations)
-      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+      : Conv2dFunctorBase(strides, paddings, dilations) {}

   void operator()(const Tensor *input,
                   const Tensor *filter,

@@ -76,9 +86,10 @@ struct Conv2dFunctor {
     for (int h = 0; h < height; ++h) {
       for (int w = 0; w < width; ++w) {
         for (int c = 0; c < channels; ++c) {
-          T bias_channel = bias_data ? bias_data[c] : 0;
+          T bias_channel = 0.0f;
+          if (bias) bias_channel = bias_data[c];
           *output_data = bias_channel;
-          T sum = 0;
+          T sum = 0.0f;
           const T *filter_ptr = filter_data + c;
           for (int kh = 0; kh < kernel_h; ++kh) {
             for (int kw = 0; kw < kernel_w; ++kw) {

@@ -113,9 +124,6 @@ struct Conv2dFunctor {
       }
     }
-
-  const int *strides_;    // [stride_h, stride_w]
-  const int *dilations_;  // [dilation_h, dilation_w]
-  Padding paddings_;
 };

 template <>

@@ -123,11 +131,19 @@ void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
                                                         const Tensor *filter,
                                                         const Tensor *bias,
                                                         Tensor *output);

-template <>
-void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                          const Tensor *filter,
-                                                          const Tensor *bias,
-                                                          Tensor *output);
+template <typename T>
+struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
+  Conv2dFunctor(const int *strides,
+                const Padding &paddings,
+                const int *dilations)
+      : Conv2dFunctorBase(strides, paddings, dilations) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output);
+};

 } // namespace kernels
 } // namespace mace
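A note on the refactor above: hoisting the stride/padding/dilation bookkeeping into Conv2dFunctorBase is what lets the OpenCL functor become a partial specialization that is still generic over the stored data type T. A minimal sketch of the same pattern, using simplified stand-in types rather than MACE's real Tensor/Padding definitions:

    // Sketch: base struct holding shared parameters + a device-specific partial
    // specialization that stays templated on the data type.
    #include <cstdio>

    enum DeviceType { CPU, OPENCL };
    enum Padding { VALID, SAME };
    struct Tensor {};

    struct ConvFunctorBase {
      ConvFunctorBase(const int *strides, Padding padding, const int *dilations)
          : strides_(strides), dilations_(dilations), padding_(padding) {}
      const int *strides_;    // [stride_h, stride_w]
      const int *dilations_;  // [dilation_h, dilation_w]
      Padding padding_;
    };

    template <DeviceType D, typename T>
    struct ConvFunctor : ConvFunctorBase {
      ConvFunctor(const int *s, Padding p, const int *d) : ConvFunctorBase(s, p, d) {}
      void operator()(const Tensor *, Tensor *) { std::printf("reference path\n"); }
    };

    // Mirrors Conv2dFunctor<DeviceType::OPENCL, T>: one device, any data type.
    template <typename T>
    struct ConvFunctor<OPENCL, T> : ConvFunctorBase {
      ConvFunctor(const int *s, Padding p, const int *d) : ConvFunctorBase(s, p, d) {}
      void operator()(const Tensor *, Tensor *) { std::printf("OpenCL path\n"); }
    };

    int main() {
      int strides[2] = {1, 1};
      int dilations[2] = {1, 1};
      Tensor in, out;
      ConvFunctor<CPU, float>(strides, VALID, dilations)(&in, &out);     // reference path
      ConvFunctor<OPENCL, float>(strides, VALID, dilations)(&in, &out);  // OpenCL path
      return 0;
    }

Call sites stay identical for every specialization, while the parameter members live in one place instead of being repeated per device.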
mace/kernels/opencl/buffer_to_image.cc  (+2 −2)

@@ -24,8 +24,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   }
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum<T>::value));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum<T>::value));
   auto runtime = OpenCLRuntime::Get();
   string kernel_name;
   switch (type) {
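The change above keys the generated transform kernel off the op's template parameter T instead of the runtime dtype of the destination image. A small sketch of compile-time type-to-build-option mapping in the same spirit (helper names here are hypothetical, not MACE's API):

    // Sketch: derive OpenCL -D options from a C++ template type at compile time.
    #include <iostream>
    #include <set>
    #include <string>

    struct half_t {};  // stand-in for MACE's half type

    template <typename T> struct CLTypeName;
    template <> struct CLTypeName<float>  { static constexpr const char *name = "float"; };
    template <> struct CLTypeName<half_t> { static constexpr const char *name = "half";  };

    template <typename T>
    std::set<std::string> BuildOptions() {
      std::set<std::string> opts;
      opts.insert(std::string("-DDATA_TYPE=") + CLTypeName<T>::name);
      return opts;
    }

    int main() {
      for (const auto &o : BuildOptions<float>())  std::cout << o << "\n";  // -DDATA_TYPE=float
      for (const auto &o : BuildOptions<half_t>()) std::cout << o << "\n";  // -DDATA_TYPE=half
      return 0;
    }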
mace/kernels/opencl/cl/conv_2d_1x1.cl  (+39 −18)

@@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __read_only image2d_t bn_offset, /* cout%4 * cout/4 */
 #endif
                           __write_only image2d_t output,
+                          __private const int in_height,
+                          __private const int in_width,
                           __private const int in_ch_blks,
+                          __private const int height,
                           __private const int width) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);

@@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif

   int4 w;
+#if STRIDE == 1
   w.x = out_w_blk;
   w.y = w.x + out_w_blks;
   w.z = w.y + out_w_blks;
   w.w = w.z + out_w_blks;
+  int out_hb_idx = (out_hb % height);
+#else
+  w.x = out_w_blk * 2;
+  w.y = (out_w_blk + out_w_blks) * 2;
+  w.z = (out_w_blk + 2 * out_w_blks) * 2;
+  w.w = (out_w_blk + 3 * out_w_blks) * 2;
+  int out_hb_idx = (out_hb % height) * 2;
+#endif
+
+  w.x = select(w.x, INT_MIN, w.x >= in_width);
+  w.y = select(w.y, INT_MIN, w.y >= in_width);
+  w.z = select(w.z, INT_MIN, w.z >= in_width);
+  w.w = select(w.w, INT_MIN, w.w >= in_width);
+  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
+                      -1,
+                      out_hb_idx >= in_height);

   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
-    DATA_TYPE4 in1 = 0;
-    DATA_TYPE4 in2 = 0;
-    DATA_TYPE4 in3 = 0;
-    if (w.y < width) {
-      // conditional load hurt perf, this branching helps sometimes
-      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
-      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
-      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
-    }
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
+    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
+    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
+    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));

     const int filter_x0 = in_ch_blk << 2;
     DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));

@@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
     out3 += in3.z * weights2;
     out3 += in3.w * weights3;

-    in_x_base += width;
+    in_x_base += in_width;
   }

 #ifdef FUSED_BATCH_NORM

@@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif

   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
+  int out_x_idx = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);

-  if (w.y >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);

-  if (w.z >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);

-  if (w.w >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
 }
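For reference, the index arithmetic the rewritten 1x1 kernel performs per work-item can be checked on the host. The sketch below (plain C++, hypothetical sizes) reproduces the mapping from an output column block to input columns for stride 1 and stride 2, mirroring the #if STRIDE == 1 / #else branches and the bound checks before the writes:

    // Sketch: each work-item covers up to four output columns spaced out_w_blks
    // apart; for stride 2 the source column is simply doubled.
    #include <cstdio>
    #include <initializer_list>

    int main() {
      const int width = 7;                     // output width
      const int out_w_blks = (width + 3) / 4;  // RoundUpDiv4(width)
      for (int stride : {1, 2}) {
        std::printf("stride %d:\n", stride);
        for (int out_w_blk = 0; out_w_blk < out_w_blks; ++out_w_blk) {
          for (int i = 0; i < 4; ++i) {
            int out_x = out_w_blk + i * out_w_blks;        // output column of this slot
            if (out_x >= width) continue;                  // same bound checked before WRITE_IMAGET
            int in_x = (stride == 1) ? out_x : out_x * 2;  // matches the STRIDE branches
            std::printf("  out_x=%d  <-  in_x=%d\n", out_x, in_x);
          }
        }
      }
      return 0;
    }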
mace/kernels/opencl/cl/conv_2d_3x3.cl  (+121 −129)

@@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 const int rounded_in_ch = in_ch_blks * 4;
 const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-  VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
-#ifdef BIAS
-  out[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
-  out[1] = out[0];
-  out[2] = out[0];
-  out[3] = out[0];
-  out[4] = out[0];
-#endif
-
-  int w[5];
-  w[0] = out_w_blk - padding_left;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
-  w[4] = w[3] + out_w_blks;
-
-  const int batch_idx = out_hb / out_height;
-  const int height_idx = out_hb % out_height;
-  int in_hb[3];
-  in_hb[0] = height_idx - padding_top;
-  in_hb[1] = in_hb[0] + 1;
-  in_hb[2] = in_hb[1] + 1;
-  // Judge the height border for padding input.
-  in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height;
-  in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height;
-  in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height;
-
-  const int input_image_width = in_ch_blks * in_width;
-  VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
-  VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
-  int in_idx, hb_idx, width_idx, in_width_idx;
-  // Unrolling this loop hurt perfmance
-  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    for (int i = 0; i < 9; ++i) {
-      in_idx = in_ch_blk * in_width;
-      hb_idx = i / 3;
-      width_idx = i % 3;
-
-      in_width_idx = w[0] + width_idx;
-      // Judge the width border for padding input.
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[0] = 0;
-      } else {
-        in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[1] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[1] = 0;
-      } else {
-        in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[2] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[2] = 0;
-      } else {
-        in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[3] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[3] = 0;
-      } else {
-        in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[4] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[4] = 0;
-      } else {
-        in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-
-      int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
-      weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-      weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-      weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-      weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
-
-      // Will prefetch L2 improve performance? How to pretch image data?
-      // Interleaving load and mul does not improve performance as expected
-      out[0] += in[0].x * weights[0];
-      out[0] += in[0].y * weights[1];
-      out[0] += in[0].z * weights[2];
-      out[0] += in[0].w * weights[3];
-
-      out[1] += in[1].x * weights[0];
-      out[1] += in[1].y * weights[1];
-      out[1] += in[1].z * weights[2];
-      out[1] += in[1].w * weights[3];
-
-      out[2] += in[2].x * weights[0];
-      out[2] += in[2].y * weights[1];
-      out[2] += in[2].z * weights[2];
-      out[2] += in[2].w * weights[3];
-
-      out[3] += in[3].x * weights[0];
-      out[3] += in[3].y * weights[1];
-      out[3] += in[3].z * weights[2];
-      out[3] += in[3].w * weights[3];
-
-      out[4] += in[4].x * weights[0];
-      out[4] += in[4].y * weights[1];
-      out[4] += in[4].z * weights[2];
-      out[4] += in[4].w * weights[3];
-    }
-  }
-
-  const int out_x_base = out_ch_blk * out_width;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[0] + padding_left, out_hb),
-                                       out[0]);
-
-  w[1] += padding_left;
-  if (w[1] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[1], out_hb),
-                                       out[1]);
-
-  w[2] += padding_left;
-  if (w[2] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[2], out_hb),
-                                       out[2]);
-
-  w[3] += padding_left;
-  if (w[3] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[3], out_hb),
-                                       out[3]);
-
-  w[4] += padding_left;
-  if (w[4] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[4], out_hb),
-                                       out[4]);
+#ifdef BIAS
+  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  DATA_TYPE4 out1 = out0;
+  DATA_TYPE4 out2 = out0;
+  DATA_TYPE4 out3 = out0;
+  DATA_TYPE4 out4 = out0;
+#else
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
+  DATA_TYPE4 out4 = 0;
+#endif
+
+#if STRIDE == 1
+  int in_width0 = out_w_blk - padding_left;
+  int in_width1 = in_width0 + out_w_blks;
+  int in_width2 = in_width1 + out_w_blks;
+  int in_width3 = in_width2 + out_w_blks;
+  int in_width4 = in_width3 + out_w_blks;
+  const int height_idx = (out_hb % out_height) - padding_top;
+#else
+  int in_width0 = out_w_blk * 2 - padding_left;
+  int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left;
+  int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
+  int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
+  int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left;
+  const int height_idx = (out_hb % out_height) * 2 - padding_top;
+#endif
+
+  const int batch_idx = (out_hb / out_height) * in_height;
+
+  DATA_TYPE4 in0, in1, in2, in3, in4;
+  DATA_TYPE4 weights0, weights1, weights2, weights3;
+  int in_idx, hb_idx, width_idx, in_width_idx;
+  // Unrolling this loop hurt perfmance
+  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
+      for (short width_idx = 0; width_idx < 3; ++width_idx) {
+        in_idx = in_ch_blk * in_width;
+        int in_hb_value = height_idx + hb_idx;
+        in_hb_value = select(in_hb_value + batch_idx,
+                             -1,
+                             (in_hb_value < 0 || in_hb_value >= in_height));
+        int in_width_value;
+#define READ_INPUT(i)                                                                \
+        in_width_value = in_width##i + width_idx;                                    \
+        in_width_value = select(in_idx + in_width_value,                             \
+                                -1,                                                  \
+                                (in_width_value < 0 || in_width_value >= in_width)); \
+        in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
+
+        READ_INPUT(0);
+        READ_INPUT(1);
+        READ_INPUT(2);
+        READ_INPUT(3);
+        READ_INPUT(4);
+#undef READ_INPUT
+
+        int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
+        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+        // Will prefetch L2 improve performance? How to pretch image data?
+        // Interleaving load and mul does not improve performance as expected
+        out0 += in0.x * weights0;
+        out0 += in0.y * weights1;
+        out0 += in0.z * weights2;
+        out0 += in0.w * weights3;
+
+        out1 += in1.x * weights0;
+        out1 += in1.y * weights1;
+        out1 += in1.z * weights2;
+        out1 += in1.w * weights3;
+
+        out2 += in2.x * weights0;
+        out2 += in2.y * weights1;
+        out2 += in2.z * weights2;
+        out2 += in2.w * weights3;
+
+        out3 += in3.x * weights0;
+        out3 += in3.y * weights1;
+        out3 += in3.z * weights2;
+        out3 += in3.w * weights3;

+        out4 += in4.x * weights0;
+        out4 += in4.y * weights1;
+        out4 += in4.z * weights2;
+        out4 += in4.w * weights3;
+      }
+    }
+  }
+
+  const int out_x_base = out_ch_blk * out_width;
+  int w = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
+
+  w += out_w_blks;
+  if (w >= out_width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out4);
 }
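One idea behind this rewrite is branchless border handling: instead of an if/else per element, an out-of-range coordinate is replaced by an invalid sentinel and the clamped sampler then returns the border colour (all zeros for the 4-channel float image layout assumed here), so the multiply-accumulate simply adds zero. A host-side C++ sketch of that select() pattern:

    // Sketch: emulate the kernel's select()-based padding on the host.
    // select(a, b, c) picks b where c is true, a otherwise.
    #include <cstdio>

    static int select_int(int a, int b, bool c) { return c ? b : a; }

    int main() {
      const int in_width = 5;
      for (int x = -1; x <= 5; ++x) {
        // Out-of-range columns become -1; a CLK_ADDRESS_CLAMP sampler then yields
        // the zero border colour, so no branch is needed in the inner loop.
        int coord = select_int(x, -1, x < 0 || x >= in_width);
        std::printf("x=%2d -> sampled coord %2d%s\n", x, coord,
                    coord < 0 ? " (reads as zero padding)" : "");
      }
      return 0;
    }

The other change visible above, replacing the out[5]/in[5]/weights[4] arrays with individually named vectors, gives the OpenCL compiler a better chance of keeping the accumulators in registers rather than spilling an indexed private array.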
mace/kernels/opencl/conv_2d_opencl.cc  (+13 −10)

@@ -10,33 +10,33 @@ namespace kernels {
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

 extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

 extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

 extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, const int *padding,
-                               Tensor *output);
+                               const DataType dt, Tensor *output);

-template <>
-void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
-                                                          const Tensor *filter,
-                                                          const Tensor *bias,
-                                                          Tensor *output) {
+template <typename T>
+void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+                                                      const Tensor *filter,
+                                                      const Tensor *bias,
+                                                      Tensor *output) {
   typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter,
                                        const Tensor *bias, const int *padding,
-                                       Tensor *output);
+                                       DataType dt, Tensor *output);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dOpenclFunction selector[5][2] = {
       {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2},
       {nullptr, nullptr},
-      {Conv2dOpenclK3x3S1, nullptr},
+      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
       {nullptr, nullptr},
       {nullptr, nullptr}};

@@ -50,7 +50,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
        << " stride " << strides_[0] << "x" << strides_[1]
        << " is not implemented yet, using slow version";
    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
-    Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
+    Conv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
        input, filter, bias, output);
    return;
  }

@@ -70,8 +70,11 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
   }
   auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_func(input, filter, bias, paddings.data(), output);
+  conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum<T>::value, output);
 }

+template struct Conv2dFunctor<DeviceType::OPENCL, float>;
+template struct Conv2dFunctor<DeviceType::OPENCL, half>;
+
 } // namespace kernels
 } // namespace mace
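The dispatch above picks an implementation from a kernel-size by stride table; nullptr entries fall through to the slow reference path. A small self-contained sketch of that selection scheme (illustrative function names, not the real MACE entry points):

    // Sketch: [kernel_size][stride] matrix of function pointers with nullptr
    // marking unimplemented combinations.
    #include <cstdio>

    typedef void (*ConvFn)(int stride);

    static void K1x1(int s) { std::printf("1x1 kernel, stride %d\n", s); }
    static void K3x3(int s) { std::printf("3x3 kernel, stride %d\n", s); }

    int main() {
      // rows: kernel size 1..5, columns: stride 1..2
      static const ConvFn selector[5][2] = {
          {K1x1, K1x1},
          {nullptr, nullptr},
          {K3x3, K3x3},        // after this commit, 3x3 stride 2 is also dispatched
          {nullptr, nullptr},
          {nullptr, nullptr}};

      int kernel_h = 3, stride = 2;
      ConvFn fn = selector[kernel_h - 1][stride - 1];
      if (fn != nullptr) {
        fn(stride);
      } else {
        std::printf("falling back to the reference implementation\n");
      }
      return 0;
    }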
mace/kernels/opencl/conv_2d_opencl_1x1.cc  (+10 −6)

@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
              const Tensor *filter,
              const Tensor *bias,
              const int stride,
+             const DataType dt,
              Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);

@@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);

-  MACE_CHECK(stride == 1);
   MACE_CHECK(input_batch == batch);
-  MACE_CHECK(stride != 1 || (input_height == height && input_width == width));

   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
   built_options.emplace("-DSTRIDE=" + ToString(stride));
   if (bias != nullptr) {
     built_options.emplace("-DBIAS");

@@ -54,7 +53,10 @@ void Conv1x1(const Tensor *input,
     conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
   }
   conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));

   auto command_queue = runtime->command_queue();

@@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
                                const int *padding,
+                               const DataType dt,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 1, output);
+  Conv1x1(input, filter, bias, 1, dt, output);
 };

 extern void Conv2dOpenclK1x1S2(const Tensor *input,
                                const Tensor *filter,
                                const Tensor *bias,
                                const int *padding,
+                               const DataType dt,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 2, output);
+  Conv1x1(input, filter, bias, 2, dt, output);
 };

 } // namespace kernels
mace/kernels/opencl/conv_2d_opencl_3x3.cc  (+11 −6)

@@ -13,7 +13,8 @@ namespace kernels {
 static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                          const Tensor *bias, const uint32_t stride,
-                         const int *padding, Tensor *output) {
+                         const int *padding, const DataType dt,
+                         Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
   const index_t width = output->dim(2);

@@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   const index_t width_blocks = RoundUpDiv<index_t, 5>(width);

   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
-  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
+  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt));
+  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt));
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));

   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();

@@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
 }

 void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
-                        const Tensor *bias, const int *padding, Tensor *output) {
-  Conv2d3x3S12(input, filter, bias, 1, padding, output);
+                        const Tensor *bias, const int *padding,
+                        const DataType dt, Tensor *output) {
+  Conv2d3x3S12(input, filter, bias, 1, padding, dt, output);
 };

 void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
-                        const Tensor *bias, const int *padding, Tensor *output) {
+                        const Tensor *bias, const int *padding,
+                        const DataType dt, Tensor *output) {
+  Conv2d3x3S12(input, filter, bias, 2, padding, dt, output);
 };

 } // namespace kernels
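The 3x3 path sizes its work grid with RoundUpDiv<index_t, 5>(width) because each work-item now produces five output columns (the 1x1 path uses RoundUpDiv4). A sketch of the round-up division, assuming the semantics implied by the call sites above (the real helper lives in MACE's utilities):

    // Sketch: ceiling division used to compute the number of width blocks.
    #include <cstdint>
    #include <cstdio>

    template <typename T, unsigned N>
    T RoundUpDiv(T value) {            // hypothetical signature mirroring the usage above
      return (value + N - 1) / N;
    }

    int main() {
      using index_t = int64_t;
      index_t width = 33;
      std::printf("width_blocks (3x3, 5 columns per item): %lld\n",
                  static_cast<long long>(RoundUpDiv<index_t, 5>(width)));  // 7
      std::printf("width_blocks (1x1, 4 columns per item): %lld\n",
                  static_cast<long long>(RoundUpDiv<index_t, 4>(width)));  // 9
      return 0;
    }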
mace/kernels/opencl/helper.cc  (+2 −4)

@@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
 std::string DataTypeToCLType(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
-      return "float";
     case DT_HALF:
-      return "half";
+      return "float";
     case DT_UINT8:
       return "uchar";
     case DT_INT8:

@@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) {
 std::string DataTypeToOPENCLCMDDataType(const DataType dt) {
   switch (dt) {
     case DT_FLOAT:
-      return "f";
     case DT_HALF:
-      return "h";
+      return "f";
     default:
       LOG(FATAL) << "Not supported data type for opencl cmd data type";
       return "";
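After this change half tensors keep 16-bit storage in the OpenCL image, but the kernels are compiled with the float DATA_TYPE and the read_imagef/write_imagef command suffix, relying on the image unit to convert CL_HALF_FLOAT texels to and from float. A compact sketch of the resulting mapping (simplified enum, not MACE's real DataType):

    // Sketch: half maps to the float kernel-side type after this commit.
    #include <cstdio>
    #include <string>

    enum DataType { DT_FLOAT, DT_HALF, DT_UINT8 };

    std::string ToCLType(DataType dt) {
      switch (dt) {
        case DT_FLOAT:
        case DT_HALF:
          return "float";   // was "half" for DT_HALF before this commit
        case DT_UINT8:
          return "uchar";
      }
      return "";
    }

    int main() {
      std::printf("DT_HALF -> %s\n", ToCLType(DT_HALF).c_str());  // float
      return 0;
    }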
mace/ops/buffer_to_image.cc  (+1 −1)

@@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
 REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage")
                              .TypeConstraint<half>("T")
                              .Build(),
-                         BufferToImageOp<DeviceType::OPENCL, float>);
+                         BufferToImageOp<DeviceType::OPENCL, half>);

 } // namespace mace
mace/ops/buffer_to_image_test.cc  (+6 −0)

@@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
       .Input("Input")
       .Output("B2IOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef());

   // Add input data

@@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector<index_t> &input_
       .Input("B2IOutput")
       .Output("I2BOutput")
       .AddIntArg("buffer_type", type)
+      .AddIntArg("T", DataTypeToEnum<T>::value)
       .Finalize(net.NewOperatorDef());

   // Run

@@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {1});
 }

+TEST(BufferToImageTest, ArgHalfSmall) {
+  TestBidirectionTransform<DeviceType::OPENCL, half>(kernels::ARGUMENT, {1});
+}
+
 TEST(BufferToImageTest, ArgMedia) {
   TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::ARGUMENT, {11});
 }
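For context, TestBidirectionTransform is a round-trip check: convert a buffer to the image layout and back, then require the result to match the input; the new ArgHalfSmall case runs the same check with half data. A toy sketch of the idea (placeholder transforms, not the MACE test utilities):

    // Sketch: round-trip property behind the bidirectional transform test.
    #include <cassert>
    #include <vector>

    template <typename T>
    std::vector<T> BufferToImageLayout(const std::vector<T> &buf) { return buf; }  // placeholder

    template <typename T>
    std::vector<T> ImageToBufferLayout(const std::vector<T> &img) { return img; }  // inverse placeholder

    int main() {
      std::vector<float> input = {0.1f, 0.2f, 0.3f};
      std::vector<float> round_trip = ImageToBufferLayout(BufferToImageLayout(input));
      assert(round_trip == input);  // same property, exercised here for both float and half
      return 0;
    }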
mace/ops/conv_2d.cc  (+10 −0)

@@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
                           .Build(),
                       Conv2dOp<DeviceType::CPU, float>);

+REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      Conv2dOp<DeviceType::CPU, half>);
+
 #if __ARM_NEON
 REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D")
                            .TypeConstraint<float>("T")

@@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
                              .Build(),
                          Conv2dOp<DeviceType::OPENCL, float>);

+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         Conv2dOp<DeviceType::OPENCL, half>);
+
 } // namespace mace
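The registrations above key the same op name by the "T" type attribute so the runtime can pick a float or half build of Conv2D. A toy sketch of type-constrained registration (registry shape is illustrative only, not MACE's registry):

    // Sketch: one op name, multiple implementations keyed by the type attribute.
    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>

    using OpFactory = std::function<void()>;
    static std::map<std::string, OpFactory> registry;

    void RegisterOp(const std::string &name, const std::string &type, OpFactory f) {
      registry[name + "/T=" + type] = std::move(f);
    }

    int main() {
      RegisterOp("Conv2D", "float", [] { std::cout << "Conv2dOp<OPENCL, float>\n"; });
      RegisterOp("Conv2D", "half",  [] { std::cout << "Conv2dOp<OPENCL, half>\n"; });
      registry["Conv2D/T=half"]();   // dispatch picks the half-typed kernel
      return 0;
    }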
mace/ops/conv_2d_benchmark.cc  (+37 −36)

@@ -27,15 +27,15 @@ static void Conv2d(int iters,
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
-  net.AddRandomInput<D, T>("Filter", {kernel_h, kernel_w, channels, output_channels});
-  net.AddRandomInput<D, T>("Bias", {output_channels});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
+  net.AddRandomInput<D, float>("Filter", {kernel_h, kernel_w, channels, output_channels});
+  net.AddRandomInput<D, float>("Bias", {output_channels});

   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
         .Input("FilterImage")

@@ -44,6 +44,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
     OpDefBuilder("Conv2D", "Conv2dTest")

@@ -54,6 +55,7 @@ static void Conv2d(int iters,
         .AddIntsArg("strides", {stride, stride})
         .AddIntArg("padding", padding)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   }

@@ -88,43 +90,42 @@ static void Conv2d(int iters,
     BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)         \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);   \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

The benchmark list is reworked: the ICNet cases now run in both float and half, and the remaining single-precision experiments are commented out. The resulting list:

 // ICNet
 BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
 BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
 BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
 // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
 BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
 BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
 // Test RGB <-> YUV
 // BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
 // BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
 // BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 // BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
 // BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
 // BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
 // BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
 // BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
 // BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
 // BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
 // BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
 // BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
 // BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
 // BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
 // BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
 // BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
 // BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
 // BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
 // BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
 // BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
 // BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
 // BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
 // BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);

 } // namespace mace
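For readers unfamiliar with the BM_CONV_2D macros above: each invocation token-pastes one benchmark per (shape, type, device) tuple, which is how the new half entries become separate rows in the benchmark report. A simplified, hypothetical sketch of that expansion (not the real benchmark framework):

    // Sketch: macro stamping out one function per benchmark configuration.
    #include <cstdio>

    #define DEFINE_BENCH(N, C, TYPE, DEVICE)                            \
      static void BM_CONV_2D_##N##_##C##_##TYPE##_##DEVICE() {          \
        std::printf("running %s\n", #N "_" #C "_" #TYPE "_" #DEVICE);   \
      }

    DEFINE_BENCH(1, 512, float, OPENCL)
    DEFINE_BENCH(1, 512, half, OPENCL)   // the half variants added in this commit

    int main() {
      BM_CONV_2D_1_512_float_OPENCL();
      BM_CONV_2D_1_512_half_OPENCL();
      return 0;
    }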
mace/ops/conv_2d_test.cc
浏览文件 @
8db742cf
...
@@ -98,9 +98,9 @@ void TestNHWCSimple3x3VALID() {
...
@@ -98,9 +98,9 @@ void TestNHWCSimple3x3VALID() {
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"FilterImage"
)
...
@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
...
@@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
// Transfer output
// Transfer output
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
@@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() {
...
@@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
}
}
auto
expected
=
CreateTensor
<
T
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
ExpectTensorNear
<
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.0
01
);
ExpectTensorNear
<
float
,
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.
01
);
}
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
...
@@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() {
...
@@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() {
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"FilterImage"
)
...
@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
...
@@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
// Transfer output
// Transfer output
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
@@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() {
...
@@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
}
}
auto
expected
=
CreateTensor
<
T
>
(
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
3
,
3
,
1
},
{
1
,
3
,
3
,
1
},
{
8.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
18.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
8.1
f
});
{
8.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
18.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
8.1
f
});
ExpectTensorNear
<
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.0
01
);
ExpectTensorNear
<
float
,
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.
01
);
}
}
TEST_F
(
Conv2dOpTest
,
CPUSimple
)
{
TEST_F
(
Conv2dOpTest
,
CPUSimple
)
{
...
@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
...
@@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) {
TestSimple3x3WithoutBias
<
DeviceType
::
NEON
>
();
TestSimple3x3WithoutBias
<
DeviceType
::
NEON
>
();
}
}
template
<
DeviceType
D
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3WithoutBias
()
{
void
TestNHWCSimple3x3WithoutBias
()
{
OpsTestNet
net
;
OpsTestNet
net
;
// Add input data
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
"Filter"
,
{
3
,
3
,
2
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"InputImage"
)
...
@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
...
@@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
// Transfer output
// Transfer output
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
...
@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
...
@@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
...
@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
...
@@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() {
// Check
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.0
f
});
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.0
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.0
01
);
ExpectTensorNear
<
float
,
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.
01
);
}
}
TEST_F
(
Conv2dOpTest
,
CPUWithoutBias
)
{
TEST_F
(
Conv2dOpTest
,
CPUWithoutBias
)
{
TestNHWCSimple3x3WithoutBias
<
DeviceType
::
CPU
>
();
TestNHWCSimple3x3WithoutBias
<
DeviceType
::
CPU
,
float
>
();
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLWithoutBias
)
{
TEST_F
(
Conv2dOpTest
,
OPENCLWithoutBias
)
{
TestNHWCSimple3x3WithoutBias
<
DeviceType
::
OPENCL
>
();
TestNHWCSimple3x3WithoutBias
<
DeviceType
::
OPENCL
,
float
>
();
}
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
...
@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
...
@@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3
<
DeviceType
::
NEON
>
();
TestCombined3x3
<
DeviceType
::
NEON
>
();
}
}
template
<
DeviceType
D
>
template
<
DeviceType
D
,
typename
T
>
static
void
TestNHWCCombined3x3
()
{
static
void
TestNHWCCombined3x3
()
{
// Construct graph
// Construct graph
OpsTestNet
net
;
OpsTestNet
net
;
// Add input data
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Input"
,
{
1
,
5
,
5
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
"Input"
,
{
1
,
5
,
5
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
3
,
3
,
2
,
2
},
"Filter"
,
{
3
,
3
,
2
,
2
},
{
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
{
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
});
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"InputImage"
)
...
@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
...
@@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() {
.
AddIntsArg
(
"strides"
,
{
2
,
2
})
.
AddIntsArg
(
"strides"
,
{
2
,
2
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
...
@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
...
@@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() {
.
AddIntsArg
(
"strides"
,
{
2
,
2
})
.
AddIntsArg
(
"strides"
,
{
2
,
2
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Run
// Run
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
...
@@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() {
       {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f,
                      12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f,
                      8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f});
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float, T>(*expected, *net.GetOutput("Output"), 0.01);
 }
+TEST_F(Conv2dOpTest, CPUStride2) {
+  TestNHWCCombined3x3<DeviceType::CPU, float>();
+}
-TEST_F(Conv2dOpTest, CPUCombined) {
-  TestNHWCCombined3x3<DeviceType::CPU>();
+TEST_F(Conv2dOpTest, OPENCLStride2) {
+  TestNHWCCombined3x3<DeviceType::OPENCL, float>();
 }
 template<DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
   // Add input data
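The comparison in the hunk above now goes through the two-parameter ExpectTensorNear introduced in ops_test_util.h (see the hunks near the end of this diff), and the absolute tolerance is relaxed from 0.001 to 0.01, presumably to leave headroom for lower-precision kernels. A hedged usage sketch, assuming a test body with a float reference tensor named expected in scope; the second call additionally assumes half is accepted by the is_floating_point_type trait these helpers use:

// Reference stays float, the result tensor's type follows the test's T parameter.
ExpectTensorNear<float, float>(*expected, *net.GetOutput("Output"), 0.01);
ExpectTensorNear<float, half>(*expected, *net.GetOutput("Output"), 0.01);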
...
@@ -415,37 +428,35 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
-  // Construct graph
   if (D == DeviceType::OPENCL) {
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
-    OpDefBuilder("Conv2D", "Conv2dTest")
-        .Input("InputImage")
-        .Input("FilterImage")
-        .Input("BiasImage")
-        .Output("OutputImage")
-        .AddIntsArg("strides", {1, 1})
-        .AddIntArg("padding", Padding::VALID)
-        .AddIntsArg("dilations", {1, 1})
-        .Finalize(net.NewOperatorDef());
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
     // Transfer output
-    ImageToBuffer<D>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
   } else {
     OpDefBuilder("Conv2D", "Conv2DTest")
         .Input("Input")
         .Input("Filter")
         .Input("Bias")
         .Output("Output")
         .AddIntsArg("strides", {1, 1})
         .AddIntArg("padding", Padding::VALID)
         .AddIntsArg("dilations", {1, 1})
         .Finalize(net.NewOperatorDef());
     // Run
     net.RunOp(D);
   }
...
@@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) {
   TestConv1x1<DeviceType::OPENCL>();
 }
-template<DeviceType D>
+template<DeviceType D, typename T>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
...
@@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     srand(time(NULL));
     // generate random input
-    // TODO test all sizes
     index_t batch = 3 + (rand() % 10);
     index_t height = shape[0];
     index_t width = shape[1];
...
@@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Add input data
-    net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+    net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
-    net.AddRandomInput<D, float>(
+    net.AddRandomInput<D, T>(
         "Filter", {kernel_h, kernel_w, input_channels, output_channels});
-    net.AddRandomInput<D, float>("Bias", {output_channels});
+    net.AddRandomInput<D, T>("Bias", {output_channels});
     // run on cpu
     net.RunOp();
...
@@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     expected.Copy(*net.GetOutput("Output"));
     // run on gpu
-    BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage<D>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
-    BufferToImage<D>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("InputImage")
...
@@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
     // Run on device
     net.RunOp(D);
-    ImageToBuffer<D>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
+    ImageToBuffer<D, T>(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT);
     ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.001);
   };
   for (int kernel_size : {1, 3}) {
-    for (int stride : {1}) {
+    for (int stride : {1, 2}) {
       func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
     }
...
@@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
 }
 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 64, 128});
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 32, 32, 64});
 }
 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({107, 113, 5, 7});
 }
+
+template<DeviceType D>
+static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
+  testing::internal::LogToStderr();
+  auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
+                  Padding type) {
+    srand(time(NULL));
+    // generate random input
+    index_t batch = 3 + (rand() % 10);
+    index_t height = shape[0];
+    index_t width = shape[1];
+    index_t input_channels = shape[2] + (rand() % 10);
+    index_t output_channels = shape[3] + (rand() % 10);
+    // Construct graph
+    OpsTestNet net;
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+
+    std::vector<float> float_input_data;
+    GenerateRandomRealTypeData({batch, height, width, input_channels},
+                               float_input_data);
+    std::vector<float> float_filter_data;
+    GenerateRandomRealTypeData(
+        {kernel_h, kernel_w, input_channels, output_channels},
+        float_filter_data);
+    std::vector<float> float_bias_data;
+    GenerateRandomRealTypeData({output_channels}, float_bias_data);
+    // Add input data
+    net.AddInputFromArray<D, float>(
+        "Input", {batch, height, width, input_channels}, float_input_data);
+    net.AddInputFromArray<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, output_channels},
+        float_filter_data);
+    net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
+
+    // run on cpu
+    net.RunOp();
+    // Check
+    Tensor expected;
+    expected.Copy(*net.GetOutput("Output"));
+
+    // run on gpu
+    BufferToImage<D, half>(net, "Input", "InputImage",
+                           kernels::BufferType::IN_OUT);
+    BufferToImage<D, half>(net, "Filter", "FilterImage",
+                           kernels::BufferType::FILTER);
+    BufferToImage<D, half>(net, "Bias", "BiasImage",
+                           kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {stride_h, stride_w})
+        .AddIntArg("padding", type)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
+        .Finalize(net.NewOperatorDef());
+    // Run on device
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
+                            kernels::BufferType::IN_OUT);
+    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);
+  };
+
+  for (int kernel_size : {1, 3}) {
+    for (int stride : {1, 2}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
+    }
+  }
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({32, 32, 32, 64});
+}
+
+TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) {
+  TestHalfComplexConvNxNS12<DeviceType::OPENCL>({107, 113, 5, 7});
+}
 }
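The new half tests stage float data on the host, convert it to fp16 on the way into the OpenCL image objects, and compare the GPU result against the fp32 CPU reference with a loose 0.2 absolute tolerance. A standalone sketch of the precision lost in a float -> half -> float round trip, which is presumably why the bound is much looser than the 0.001 used for the float path (assumption: mace/core/half.h, which bundles the half_float library, can be included on its own like this):

#include <cmath>
#include <cstdio>
#include "mace/core/half.h"

int main() {
  float x = 1234.567f;
  // A half keeps roughly 11 significand bits, i.e. a relative error on the
  // order of 5e-4, so absolute errors grow with the magnitude of the values.
  half_float::half h = half_float::half_cast<half_float::half>(x);
  float back = static_cast<float>(h);
  std::printf("x = %f, after half round trip = %f, abs err = %f\n",
              x, back, std::fabs(x - back));
  return 0;
}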
mace/ops/ops_test_util.h
View file @ 8db742cf
...
@@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
                                 std::vector<T> &res) {
   std::random_device rd;
   std::mt19937 gen(rd());
-  std::normal_distribution<T> nd(0, 1);
+  std::normal_distribution<float> nd(0, 1);
   index_t size = std::accumulate(shape.begin(), shape.end(), 1,
                                  std::multiplies<index_t>());
   res.resize(size);
-  std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
+  if (DataTypeToEnum<T>::value == DT_HALF) {
+    std::generate(res.begin(), res.end(),
+                  [&gen, &nd] { return half_float::half_cast<half>(nd(gen)); });
+  } else {
+    std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); });
+  }
 }
 template <typename T>
...
@@ -290,39 +294,40 @@ inline void ExpectEqual<double>(const double &a, const double &b) {
   EXPECT_DOUBLE_EQ(a, b);
 }
-inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) {
+inline void AssertSameDims(const Tensor &x, const Tensor &y) {
-  ASSERT_EQ(x.dtype(), y.dtype());
   ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs "
                                 << "y.shape [ " << ShapeToString(y) << "]";
 }
-template <typename T, bool is_fp = is_floating_point_type<T>::value>
+template <typename EXP_TYPE, typename RES_TYPE,
+          bool is_fp = is_floating_point_type<EXP_TYPE>::value>
 struct Expector;
 // Partial specialization for float and double.
-template <typename T>
+template <typename EXP_TYPE, typename RES_TYPE>
-struct Expector<T, true> {
+struct Expector<EXP_TYPE, RES_TYPE, true> {
-  static void Equal(const T &a, const T &b) { ExpectEqual(a, b); }
+  static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); }
   static void Equal(const Tensor &x, const Tensor &y) {
-    ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
+    ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
-    AssertSameTypeDims(x, y);
+    ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
+    AssertSameDims(x, y);
     Tensor::MappingGuard x_mapper(&x);
     Tensor::MappingGuard y_mapper(&y);
-    auto a = x.data<T>();
+    auto a = x.data<EXP_TYPE>();
-    auto b = y.data<T>();
+    auto b = y.data<RES_TYPE>();
     for (int i = 0; i < x.size(); ++i) {
       ExpectEqual(a(i), b(i));
     }
   }
   static void Near(const Tensor &x, const Tensor &y, const double abs_err) {
-    ASSERT_EQ(x.dtype(), DataTypeToEnum<T>::v());
+    ASSERT_EQ(x.dtype(), DataTypeToEnum<EXP_TYPE>::v());
-    AssertSameTypeDims(x, y);
+    ASSERT_EQ(y.dtype(), DataTypeToEnum<RES_TYPE>::v());
+    AssertSameDims(x, y);
     Tensor::MappingGuard x_mapper(&x);
     Tensor::MappingGuard y_mapper(&y);
-    auto a = x.data<T>();
+    auto a = x.data<EXP_TYPE>();
-    auto b = y.data<T>();
+    auto b = y.data<RES_TYPE>();
     for (int i = 0; i < x.size(); ++i) {
       EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b
                                        << " index = " << i;
...
@@ -335,10 +340,18 @@ template <typename T>
 void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   static_assert(is_floating_point_type<T>::value,
                 "T is not a floating point type");
-  Expector<T>::Near(x, y, abs_err);
+  Expector<T, T>::Near(x, y, abs_err);
+}
+
+template <typename EXP_TYPE, typename RES_TYPE>
+void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
+  static_assert(is_floating_point_type<EXP_TYPE>::value
+                    && is_floating_point_type<RES_TYPE>::value,
+                "T is not a floating point type");
+  Expector<EXP_TYPE, RES_TYPE>::Near(x, y, abs_err);
 }
-template <DeviceType D>
+template <DeviceType D, typename T>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
                    const std::string &output_name,
...
@@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net,
       .Input(input_name)
       .Output(output_name)
      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
   // Run
...
@@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net,
   net.Sync();
 }
-template <DeviceType D>
+template <DeviceType D, typename T>
 void ImageToBuffer(OpsTestNet &net,
                    const std::string &input_name,
                    const std::string &output_name,
...
@@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net,
      .Input(input_name)
      .Output(output_name)
      .AddIntArg("buffer_type", type)
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());
   // Run
...
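With the extra template parameter, BufferToImage and ImageToBuffer now carry an explicit element type and forward it through the same "T" argument, so a test can upload float host data into GPU image memory in half precision and read the result back as float. A condensed usage sketch following the half conv test earlier in this diff, with D instantiated as DeviceType::OPENCL as in those tests and a float reference tensor named expected assumed to be in scope:

BufferToImage<DeviceType::OPENCL, half>(net, "Input", "InputImage",
                                        kernels::BufferType::IN_OUT);
BufferToImage<DeviceType::OPENCL, half>(net, "Filter", "FilterImage",
                                        kernels::BufferType::FILTER);
BufferToImage<DeviceType::OPENCL, half>(net, "Bias", "BiasImage",
                                        kernels::BufferType::ARGUMENT);
// ... build and run the Conv2D op on the image tensors ...
ImageToBuffer<DeviceType::OPENCL, float>(net, "OutputImage", "OPENCLOutput",
                                         kernels::BufferType::IN_OUT);
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.2);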