Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
5f5fa20b
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
5f5fa20b
编写于
11月 27, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Finish opencl conv3x3 with stride == 1.
上级
f12366af
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
686 addition
and
347 deletion
+686
-347
mace/core/net.cc
mace/core/net.cc
+9
-7
mace/kernels/buffer_to_image.h
mace/kernels/buffer_to_image.h
+3
-3
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+30
-31
mace/kernels/conv_pool_2d_util.cc
mace/kernels/conv_pool_2d_util.cc
+65
-0
mace/kernels/conv_pool_2d_util.h
mace/kernels/conv_pool_2d_util.h
+8
-0
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+84
-35
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+121
-112
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+16
-16
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+2
-0
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+42
-39
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+2
-2
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+263
-99
mace/ops/image_to_buffer.h
mace/ops/image_to_buffer.h
+3
-3
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+37
-0
mace/utils/tuner.h
mace/utils/tuner.h
+1
-0
未找到文件。
mace/core/net.cc
浏览文件 @
5f5fa20b
...
...
@@ -38,13 +38,15 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG
(
1
)
<<
"Running operator "
<<
op
->
debug_def
().
name
()
<<
"("
<<
op
->
debug_def
().
type
()
<<
")."
;
OperatorStats
*
op_stats
=
nullptr
;
if
(
run_metadata
&&
device_type_
!=
DeviceType
::
OPENCL
)
{
op_stats
=
run_metadata
->
add_op_stats
();
op_stats
->
set_operator_name
(
op
->
debug_def
().
name
());
op_stats
->
set_type
(
op
->
debug_def
().
type
());
op_stats
->
set_all_start_micros
(
NowInMicroSec
());
op_stats
->
set_op_start_rel_micros
(
NowInMicroSec
()
-
op_stats
->
all_start_micros
());
if
(
run_metadata
)
{
if
(
device_type_
!=
DeviceType
::
OPENCL
)
{
op_stats
=
run_metadata
->
add_op_stats
();
op_stats
->
set_operator_name
(
op
->
debug_def
().
name
());
op_stats
->
set_type
(
op
->
debug_def
().
type
());
op_stats
->
set_all_start_micros
(
NowInMicroSec
());
op_stats
->
set_op_start_rel_micros
(
NowInMicroSec
()
-
op_stats
->
all_start_micros
());
}
}
if
(
!
op
->
Run
())
{
LOG
(
ERROR
)
<<
"Operator failed: "
<<
ProtoDebugString
(
op
->
debug_def
());
...
...
mace/kernels/buffer_to_image.h
浏览文件 @
5f5fa20b
...
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_B
ATCH_NORM
_H_
#define MACE_KERNELS_B
ATCH_NORM
_H_
#ifndef MACE_KERNELS_B
UFFER_TO_IMAGE
_H_
#define MACE_KERNELS_B
UFFER_TO_IMAGE
_H_
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
...
...
@@ -40,4 +40,4 @@ struct BufferToImageFunctor<DeviceType::OPENCL, T> : BufferToImageFunctorBase{
}
// namepsace kernels
}
// namespace mace
#endif // MACE_KERNELS_B
ATCH_NORM
_H_
#endif // MACE_KERNELS_B
UFFER_TO_IMAGE
_H_
mace/kernels/conv_2d.h
浏览文件 @
5f5fa20b
...
...
@@ -11,13 +11,13 @@
namespace
mace
{
namespace
kernels
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
struct
Conv2dFunctor
{
Conv2dFunctor
()
{}
Conv2dFunctor
(
const
int
*
strides
,
const
Padding
&
paddings
,
const
int
*
dilations
)
:
strides_
(
strides
),
dilations_
(
dilations
),
paddings_
(
paddings
)
{}
:
strides_
(
strides
),
dilations_
(
dilations
),
paddings_
(
paddings
)
{}
void
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
...
...
@@ -29,23 +29,23 @@ struct Conv2dFunctor {
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
int
>
paddings
(
2
);
kernels
::
CalcPaddingAndOutputSize
(
kernels
::
Calc
NHWC
PaddingAndOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
dilations_
,
strides_
,
paddings_
,
output_shape
.
data
(),
paddings
.
data
());
output
->
Resize
(
output_shape
);
index_t
batch
=
output
->
dim
(
0
);
index_t
channels
=
output
->
dim
(
1
);
index_t
height
=
output
->
dim
(
2
);
index_t
width
=
output
->
dim
(
3
);
index_t
height
=
output
->
dim
(
1
);
index_t
width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
input_batch
=
input
->
dim
(
0
);
index_t
input_
channels
=
input
->
dim
(
1
);
index_t
input_
height
=
input
->
dim
(
2
);
index_t
input_
width
=
input
->
dim
(
3
);
index_t
input_
height
=
input
->
dim
(
1
);
index_t
input_
width
=
input
->
dim
(
2
);
index_t
input_
channels
=
input
->
dim
(
3
);
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
index_t
kernel_h
=
filter
->
dim
(
0
);
index_t
kernel_w
=
filter
->
dim
(
1
);
int
stride_h
=
strides_
[
0
];
int
stride_w
=
strides_
[
1
];
...
...
@@ -72,46 +72,45 @@ struct Conv2dFunctor {
auto
bias_data
=
bias
==
nullptr
?
nullptr
:
bias
->
data
<
T
>
();
auto
output_data
=
output
->
mutable_data
<
T
>
();
#pragma omp parallel for collapse(2)
for
(
int
n
=
0
;
n
<
batch
;
++
n
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
T
bias_channel
=
bias_data
?
bias_data
[
c
]
:
0
;
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
index_t
offset
=
n
*
channels
*
height
*
width
+
c
*
height
*
width
+
h
*
width
+
w
;
output_data
[
offset
]
=
bias_channel
;
for
(
int
h
=
0
;
h
<
height
;
++
h
)
{
for
(
int
w
=
0
;
w
<
width
;
++
w
)
{
for
(
int
c
=
0
;
c
<
channels
;
++
c
)
{
T
bias_channel
=
bias_data
?
bias_data
[
c
]
:
0
;
*
output_data
=
bias_channel
;
T
sum
=
0
;
const
T
*
filter_ptr
=
filter_data
+
c
*
kernel_size
;
for
(
int
inc
=
0
;
inc
<
input_channels
;
++
inc
)
{
for
(
int
k
h
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_w
;
++
kw
)
{
const
T
*
filter_ptr
=
filter_data
+
c
;
for
(
int
kh
=
0
;
kh
<
kernel_h
;
++
kh
)
{
for
(
int
k
w
=
0
;
kw
<
kernel_w
;
++
kw
)
{
for
(
int
inc
=
0
;
inc
<
input_channels
;
++
inc
)
{
int
inh
=
padded_h_start
+
h
*
stride_h
+
dilation_h
*
kh
;
int
inw
=
padded_w_start
+
w
*
stride_w
+
dilation_w
*
kw
;
if
(
inh
<
0
||
inh
>=
input_height
||
inw
<
0
||
inw
>=
input_width
)
{
MACE_CHECK
(
inh
>=
padded_h_start
&&
inh
<
padded_h_stop
&&
inw
>=
padded_w_start
&&
inw
<
padded_w_stop
,
inw
>=
padded_w_start
&&
inw
<
padded_w_stop
,
"Out of range read from input: "
,
inh
,
", "
,
inw
);
// else padding with 0:
// sum += 0;
}
else
{
index_t
input_offset
=
n
*
input_
channels
*
input_height
*
input_width
+
inc
*
input_height
*
input_width
+
inh
*
input_width
+
inw
;
n
*
input_
height
*
input_width
*
input_channels
+
inh
*
input_width
*
input_channels
+
inw
*
input_channels
+
inc
;
sum
+=
input_data
[
input_offset
]
*
*
filter_ptr
;
}
++
filter_ptr
;
filter_ptr
+=
channels
;
}
}
}
output_data
[
offset
]
+=
sum
;
*
output_data
+=
sum
;
output_data
++
;
}
}
}
}
}
const
int
*
strides_
;
// [stride_h, stride_w]
...
...
@@ -119,12 +118,12 @@ struct Conv2dFunctor {
Padding
paddings_
;
};
template
<
>
template
<
>
void
Conv2dFunctor
<
DeviceType
::
NEON
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
template
<
>
template
<
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
mace/kernels/conv_pool_2d_util.cc
浏览文件 @
5f5fa20b
...
...
@@ -72,6 +72,71 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
output_shape
[
3
]
=
output_width
;
}
void
CalcNHWCPaddingAndOutputSize
(
const
index_t
*
input_shape
,
// NHWC
const
index_t
*
filter_shape
,
// HWIO
const
int
*
dilations
,
const
int
*
strides
,
Padding
padding
,
index_t
*
output_shape
,
int
*
padding_size
)
{
MACE_CHECK
(
dilations
[
0
]
>
0
&&
dilations
[
1
]
>
0
,
"Invalid dilations, must >= 1"
);
MACE_CHECK
((
dilations
[
0
]
==
1
||
strides
[
0
]
==
1
)
&&
(
dilations
[
1
]
==
1
||
strides
[
1
]
==
1
),
"If dilations > 1, strides should be 1"
);
MACE_CHECK_NOTNULL
(
output_shape
);
MACE_CHECK_NOTNULL
(
padding_size
);
/*
* Convlution/pooling arithmetic:
* o = (i + 2 * p - k - (k - 1) * (d - 1)) / s + 1
* For details, see https://arxiv.org/pdf/1603.07285.pdf or
* http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
*/
padding_size
[
0
]
=
0
;
padding_size
[
1
]
=
0
;
index_t
output_height
=
0
,
output_width
=
0
;
index_t
kernel_height
=
filter_shape
[
0
];
index_t
kernel_width
=
filter_shape
[
1
];
index_t
output_channels
=
filter_shape
[
3
];
index_t
k_extent_height
=
(
kernel_height
-
1
)
*
dilations
[
0
]
+
1
;
index_t
k_extent_width
=
(
kernel_width
-
1
)
*
dilations
[
1
]
+
1
;
switch
(
padding
)
{
case
VALID
:
output_height
=
(
input_shape
[
1
]
-
k_extent_height
)
/
strides
[
0
]
+
1
;
output_width
=
(
input_shape
[
2
]
-
k_extent_width
)
/
strides
[
1
]
+
1
;
break
;
case
SAME
:
output_height
=
(
input_shape
[
1
]
-
1
)
/
strides
[
0
]
+
1
;
output_width
=
(
input_shape
[
2
]
-
1
)
/
strides
[
1
]
+
1
;
break
;
case
FULL
:
output_height
=
(
input_shape
[
1
]
+
k_extent_height
-
2
)
/
strides
[
0
]
+
1
;
output_width
=
(
input_shape
[
2
]
+
k_extent_width
-
2
)
/
strides
[
1
]
+
1
;
break
;
default:
MACE_CHECK
(
false
,
"Unsupported padding type: "
,
padding
);
}
// Note: TensorFlow may padded one more on the right/bottom side
// TODO may be it's better to also truncate the left/top to
// utilize the more centered features. We need to benchmark
// based on the model accuracy.
padding_size
[
0
]
=
std
::
max
<
int
>
(
0
,
(
output_height
-
1
)
*
strides
[
0
]
+
k_extent_height
-
input_shape
[
1
]);
padding_size
[
1
]
=
std
::
max
<
int
>
(
0
,
(
output_width
-
1
)
*
strides
[
1
]
+
k_extent_width
-
input_shape
[
2
]);
output_shape
[
0
]
=
input_shape
[
0
];
output_shape
[
1
]
=
output_height
;
output_shape
[
2
]
=
output_width
;
output_shape
[
3
]
=
output_channels
;
}
void
CalPaddingSize
(
const
index_t
*
input_shape
,
// NCHW
const
index_t
*
filter_shape
,
// OIHW
const
int
*
dilations
,
...
...
mace/kernels/conv_pool_2d_util.h
浏览文件 @
5f5fa20b
...
...
@@ -25,6 +25,14 @@ void CalcPaddingAndOutputSize(const index_t *input_shape, // NCHW
index_t
*
output_shape
,
int
*
padding_size
);
void
CalcNHWCPaddingAndOutputSize
(
const
index_t
*
input_shape
,
// NCHW
const
index_t
*
filter_shape
,
// OIHW
const
int
*
dilations
,
const
int
*
strides
,
Padding
padding
,
index_t
*
output_shape
,
int
*
padding_size
);
void
CalPaddingSize
(
const
index_t
*
input_shape
,
// NCHW
const
index_t
*
filter_shape
,
// OIHW
const
int
*
dilations
,
...
...
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
5f5fa20b
...
...
@@ -8,14 +8,31 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, i
int
w
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
const
int
out_channel_idx
=
h
*
4
;
const
int
hw_idx
=
w
/
in_channel
;
int
in_channel_idx
=
w
%
in_channel
;
const
int
rounded_in_channel
=
((
in_channel
+
3
)
/
4
)
*
4
;
const
int
hw_idx
=
w
/
rounded_in_channel
;
const
int
in_channel_idx
=
w
%
rounded_in_channel
;
const
int
h_idx
=
hw_idx
/
filter_w
;
const
int
w_idx
=
hw_idx
%
filter_w
;
const
int
offset
=
((
h_idx
*
filter_w
+
w_idx
)
*
in_channel
+
in_channel_idx
)
*
out_channel
+
out_channel_idx
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
vload4
(
0
,
input
+
offset
)
;
const
int
size
=
out_channel
-
out_channel_idx
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
0
;
if
(
in_channel_idx
<
in_channel
)
{
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
values.z
=
*
(
input
+
offset
+
2
)
;
case
2:
values.y
=
*
(
input
+
offset
+
1
)
;
case
1:
values.x
=
*
(
input
+
offset
)
;
}
}
else
{
values
=
vload4
(
0
,
input
+
offset
)
;
}
}
int2
coord
=
(
int2
)(
w,
h
)
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
coord,
values
)
;
}
...
...
@@ -28,27 +45,31 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
int
w
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
const
int
out_channel_idx
=
h
*
4
;
const
int
hw_idx
=
w
/
in_channel
;
int
in_channel_idx
=
w
%
in_channel
;
const
int
rounded_in_channel
=
((
in_channel
+
3
)
/
4
)
*
4
;
const
int
hw_idx
=
w
/
rounded_in_channel
;
const
int
in_channel_idx
=
w
%
rounded_in_channel
;
const
int
h_idx
=
hw_idx
/
filter_w
;
const
int
w_idx
=
hw_idx
%
filter_w
;
const
int
offset
=
((
h_idx
*
filter_w
+
w_idx
)
*
in_channel
+
in_channel_idx
)
*
out_channel
+
out_channel_idx
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
coord
)
;
if
(
out_channel_idx
+
4
>
out_channel
)
{
const
int
diff
=
in_channel
-
in_channel_idx
;
output[offset]
=
values.s0
;
if
(
diff
==
2
)
{
output[offset+1]
=
values.s1
;
if
(
in_channel_idx
<
in_channel
)
{
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
coord
)
;
const
int
size
=
(
out_channel
-
out_channel_idx
)
;
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
output[offset+2]
=
values.s2
;
case
2:
output[offset+1]
=
values.s1
;
case
1:
output[offset]
=
values.s0
;
}
}
else
{
output[offset+1]
=
values.s1
;
output[offset+2]
=
values.s2
;
vstore4
(
values,
0
,
output
+
offset
)
;
}
}
else
{
vstore4
(
values,
0
,
output
+
offset
)
;
}
}
...
...
@@ -66,7 +87,20 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
const
int
offset
=
((
batch_idx
*
height
+
height_idx
)
*
width
+
width_idx
)
*
channels
+
channel_idx
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
vload4
(
0
,
input
+
offset
)
;
const
int
size
=
channels
-
channel_idx
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
0
;
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
values.z
=
*
(
input
+
offset
+
2
)
;
case
2:
values.y
=
*
(
input
+
offset
+
1
)
;
case
1:
values.x
=
*
(
input
+
offset
)
;
}
}
else
{
values
=
vload4
(
0
,
input
+
offset
)
;
}
int2
coord
=
(
int2
)(
w,
h
)
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
coord,
values
)
;
}
...
...
@@ -88,14 +122,15 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
coord
)
;
if
(
channel_idx
+
4
>
channels
)
{
const
int
diff
=
channels
-
channel_idx
;
output[offset]
=
values.s0
;
if
(
diff
==
2
)
{
output[offset+1]
=
values.s1
;
}
else
{
output[offset+1]
=
values.s1
;
output[offset+2]
=
values.s2
;
const
int
size
=
channels
-
channel_idx
;
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
output[offset+2]
=
values.s2
;
case
2:
output[offset+1]
=
values.s1
;
case
1:
output[offset]
=
values.s0
;
}
}
else
{
vstore4
(
values,
0
,
output
+
offset
)
;
...
...
@@ -109,7 +144,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
int
h
=
get_global_id
(
1
)
;
const
int
offset
=
w
*
4
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
vload4
(
0
,
input
+
offset
)
;
const
int
size
=
count
-
offset
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
0
;
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
values.z
=
*
(
input
+
offset
+
2
)
;
case
2:
values.y
=
*
(
input
+
offset
+
1
)
;
case
1:
values.x
=
*
(
input
+
offset
)
;
}
}
else
{
values
=
vload4
(
0
,
input
+
offset
)
;
}
int2
coord
=
(
int2
)(
w,
h
)
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
coord,
values
)
;
}
...
...
@@ -124,14 +172,15 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
int2
coord
=
(
int2
)(
w,
h
)
;
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
values
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
coord
)
;
if
(
offset
+
4
>
count
)
{
const
int
diff
=
count
-
offset
;
output[offset]
=
values.s0
;
if
(
diff
==
2
)
{
output[offset+1]
=
values.s1
;
}
else
{
output[offset+1]
=
values.s1
;
output[offset+2]
=
values.s2
;
const
int
size
=
count
-
offset
;
if
(
size
<
4
)
{
switch
(
size
)
{
case
3:
output[offset+2]
=
values.s2
;
case
2:
output[offset+1]
=
values.s1
;
case
1:
output[offset]
=
values.s0
;
}
}
else
{
vstore4
(
values,
0
,
output
+
offset
)
;
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
5f5fa20b
#
include
<common.h>
VEC_DATA_TYPE
(
DATA_TYPE,4
)
conv1x3_s1
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr
)
{
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row0
=
vload4
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,2
)
input1
=
vload2
(
0
,
input_ptr+4
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row1
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s123,
input1.s0
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s23,
input1.s01
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s0
*
row0
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s1
*
row1
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s2
*
row2
;
}
VEC_DATA_TYPE
(
DATA_TYPE,4
)
conv1x3_s2
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr
)
{
VEC_DATA_TYPE
(
DATA_TYPE,8
)
input
=
vload8
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row0
=
input.even
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row1
=
input.odd
;
VEC_DATA_TYPE
(
DATA_TYPE,4
)
row2
=
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))(
row0.s123,
input_ptr[8]
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_values
=
vload3
(
0
,
filter_ptr
)
;
return
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s0
*
row0
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s1
*
row1
+
(
VEC_DATA_TYPE
(
DATA_TYPE,4
))
filter_values.s2
*
row2
;
}
__kernel
void
conv_2d_3x3
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
*
kw
*
kh,
cout/4
*/
#
ifdef
BIAS
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
#
endif
__write_only
image2d_t
output,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
in_channels,
__private
const
int
out_height,
__private
const
int
out_width,
__private
const
int
padding_top,
__private
const
int
padding_left
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
in_ch_blks
=
(
in_channels
+
3
)
/
4
;
const
int
rounded_in_ch
=
in_ch_blks
*
4
;
//
Supported
data
type:
half/float
DATA_TYPE
conv3x3
(
const
DATA_TYPE
*input_ptr,
const
DATA_TYPE
*filter_ptr,
const
int
row_width
)
{
VEC_DATA_TYPE
(
DATA_TYPE,3
)
input_value
=
vload3
(
0
,
input_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
filter_value
=
vload3
(
0
,
filter_ptr
)
;
VEC_DATA_TYPE
(
DATA_TYPE,3
)
res
=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
1
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
input_ptr
+=
row_width
;
input_value
=
vload3
(
0
,
input_ptr
)
;
filter_value
=
vload3
(
2
,
filter_ptr
)
;
res
+=
input_value
*
filter_value
;
return
res.s0
+
res.s1
+
res.s2
;
}
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
void
kernel
conv_2d_3x3
(
global
const
DATA_TYPE
*input,
global
const
DATA_TYPE
*filter,
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
out[4]
=
{0}
;
#
ifdef
BIAS
global
const
DATA_TYPE
*bias,
#
endif
global
DATA_TYPE
*output,
private
const
int
in_chan_num,
private
const
int
out_chan_num,
private
const
int
in_height,
private
const
int
in_width,
private
const
int
out_height,
private
const
int
out_width
)
{
int
batch
=
get_global_id
(
0
)
;
int
out_chan_blk
=
get_global_id
(
1
)
;
int
out_pixel_blk
=
get_global_id
(
2
)
;
const
int
in_pixel
=
in_height
*
in_width
;
const
int
out_pixel
=
out_height
*
out_width
;
const
int
round_out_width
=
(
out_width
+
3
)
/
4
;
const
int
out_pixel_height
=
out_pixel_blk
/
round_out_width
;
const
int
out_pixel_width
=
out_pixel_blk
%
round_out_width
;
const
int
out_chan_begin
=
out_chan_blk
*
4
;
const
int
out_chan_end
=
min
(
out_chan_begin
+
4
,
out_chan_num
)
;
const
int
out_pixel_begin
=
out_pixel_height
*
out_width
+
out_pixel_width
*
4
;
const
int
out_pixel_end
=
min
(
out_pixel_begin
+
4
,
(
out_pixel_height
+
1
)
*
out_width
)
;
#
ifdef
STRIDE_1
const
int
stride
=
1
;
#
else
const
int
stride
=
2
;
out[0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
bias,
sampler,
(
int2
)(
out_ch_blk,
0
))
;
out[1]
=
out[0]
;
out[2]
=
out[0]
;
out[3]
=
out[0]
;
#
endif
const
int
in_pixel_begin
=
out_pixel_height
*
stride
*
in_width
+
out_pixel_width
*
stride
*
4
;
const
int
in_offset
=
batch
*
in_chan_num
*
in_pixel
;
const
int
out_offset
=
batch
*
out_chan_num
*
out_pixel
;
const
DATA_TYPE
*input_base
=
input
+
in_offset
+
in_pixel_begin
;
DATA_TYPE
*output_base
=
output
+
out_offset
+
out_pixel_begin
;
int
w[4]
;
w[0]
=
out_w_blk
-
padding_left
;
w[1]
=
w[0]
+
out_w_blks
;
w[2]
=
w[1]
+
out_w_blks
;
w[3]
=
w[2]
+
out_w_blks
;
const
int
pixels
=
out_pixel_end
-
out_pixel_begin
;
const
int
batch_idx
=
out_hb
/
out_height
;
const
int
height_idx
=
out_hb
%
out_height
;
int
in_hb[3]
;
in_hb[0]
=
height_idx
-
padding_top
;
in_hb[1]
=
in_hb[0]
+
1
;
in_hb[2]
=
in_hb[1]
+
1
;
//
Judge
the
height
border
for
padding
input.
in_hb[0]
=
(
in_hb[0]
<
0
|
| in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height;
in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height;
in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? -1 : in_hb[2] + batch_idx * in_height;
for
(
int
i
=
out_chan_begin
; i < out_chan_end; ++i) {
DATA_TYPE
*output_ptr
=
output_base
+
i
*
out_pixel
;
const
DATA_TYPE
*filter_base
=
filter
+
i
*
in_chan_num
*
9
;
if
(
pixels
==
4
)
{
#
ifdef
BIAS
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
res
=
(
VEC_DATA_TYPE
(
DATA_TYPE,
4
))
bias[i]
;
#
else
VEC_DATA_TYPE
(
DATA_TYPE,
4
)
res
=
0
;
#
endif
const int input_image_width = in_ch_blks * in_width;
for
(
int
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
DATA_TYPE
*input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
;
const
DATA_TYPE
*filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
#
ifdef
STRIDE_1
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
#
else
res
+=
conv1x3_s2
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s2
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
#
endif
}
vstore4
(
res,
0
,
output_ptr
)
;
}
else
{
for
(
int
p
=
0
; p < pixels; ++p) {
#
ifdef
BIAS
DATA_TYPE
res
=
bias[i]
;
#
else
DATA_TYPE
res
=
0
;
#
endif
for
(
uint
in_chan_idx
=
0
; in_chan_idx < in_chan_num; ++in_chan_idx) {
const
DATA_TYPE
*input_ptr
=
input_base
+
in_chan_idx
*
in_pixel
+
p
*
stride
;
const
DATA_TYPE
*filter_ptr
=
filter_base
+
in_chan_idx
*
9
;
res
+=
conv3x3
(
input_ptr,
filter_ptr,
in_width
)
;
// Unrolling this loop hurt perfmance
int idx = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
int filter_idx = in_ch_blk << 2;
int in_idx = in_ch_blk * in_width;
#pragma unroll
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
idx = i * 12 + j * 4;
int in_width_idx = w[0] + j;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 0] = 0;
} else {
in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[1] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 1] = 0;
} else {
in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[2] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 2] = 0;
} else {
in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[3] + j;
if (in_width_idx < 0 |
|
in_width_idx
>=
in_width
)
{
in[idx
+
3]
=
0
;
}
else
{
in[idx
+
3]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
input,
sampler,
(
int2
)(
in_idx
+
in_width_idx,
in_hb[i]
))
;
}
output_ptr[p]
=
res
;
weights[idx
+
0]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
0
,
out_ch_blk
))
;
weights[idx
+
1]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
1
,
out_ch_blk
))
;
weights[idx
+
2]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
2
,
out_ch_blk
))
;
weights[idx
+
3]
=
CMD_TYPE
(
read_image,
CMD_DATA_TYPE
)(
filter,
sampler,
(
int2
)(
filter_idx
+
3
,
out_ch_blk
))
;
filter_idx
+=
rounded_in_ch
;
}
}
//
Will
prefetch
L2
improve
performance?
How
to
pretch
image
data?
//
Interleaving
load
and
mul
does
not
improve
performance
as
expected
#
pragma
unroll
for
(
int
c
=
0
; c < 4; ++c) {
for
(
int
i
=
0
; i < 9; ++i) {
out[c]
+=
in[c
+
i
*
4].x
*
weights[0
+
i
*
4]
;
out[c]
+=
in[c
+
i
*
4].y
*
weights[1
+
i
*
4]
;
out[c]
+=
in[c
+
i
*
4].z
*
weights[2
+
i
*
4]
;
out[c]
+=
in[c
+
i
*
4].w
*
weights[3
+
i
*
4]
;
}
}
}
const
int
out_x_base
=
out_ch_blk
*
out_width
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
(
int2
)(
out_x_base
+
w[0]
+
padding_left,
out_hb
)
,
out[0]
)
;
w[1]
+=
padding_left
;
if
(
w[1]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
(
int2
)(
out_x_base
+
w[1],
out_hb
)
,
out[1]
)
;
w[2]
+=
padding_left
;
if
(
w[2]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
(
int2
)(
out_x_base
+
w[2],
out_hb
)
,
out[2]
)
;
w[3]
+=
padding_left
;
if
(
w[3]
>=
out_width
)
return
;
CMD_TYPE
(
write_image,
CMD_DATA_TYPE
)(
output,
(
int2
)(
out_x_base
+
w[3],
out_hb
)
,
out[3]
)
;
}
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
5f5fa20b
...
...
@@ -9,33 +9,39 @@ namespace mace {
namespace
kernels
{
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
);
extern
void
Conv2dOpenclK1x1S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
);
extern
void
Conv2dOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
);
extern
void
Conv2dOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
);
template
<
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
typedef
void
(
*
Conv2dOpenclFunction
)(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
][
2
]
=
{
{
Conv2dOpenclK1x1S1
,
Conv2dOpenclK1x1S2
},
{
nullptr
,
nullptr
},
{
Conv2dOpenclK3x3S1
,
Conv2dOpenclK3x3S2
},
{
Conv2dOpenclK3x3S1
,
nullptr
},
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
}};
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
index_t
kernel_h
=
filter
->
dim
(
0
);
index_t
kernel_w
=
filter
->
dim
(
1
);
if
(
kernel_h
!=
kernel_w
||
kernel_h
>
5
||
strides_
[
0
]
!=
strides_
[
1
]
||
strides_
[
0
]
>
2
||
dilations_
[
0
]
!=
1
||
dilations_
[
1
]
!=
1
||
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
]
==
nullptr
)
{
...
...
@@ -51,7 +57,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
int
>
paddings
(
2
);
kernels
::
CalcPaddingAndOutputSize
(
kernels
::
Calc
NHWC
PaddingAndOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
dilations_
,
strides_
,
paddings_
,
output_shape
.
data
(),
paddings
.
data
());
...
...
@@ -64,13 +70,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
}
auto
conv2d_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
if
(
paddings
[
0
]
>
0
||
paddings
[
1
]
>
0
)
{
Tensor
padded_input
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
DataTypeToEnum
<
float
>::
v
());
ConstructInputWithPadding
(
input
,
paddings
.
data
(),
&
padded_input
);
conv2d_func
(
&
padded_input
,
filter
,
bias
,
output
);
}
else
{
conv2d_func
(
input
,
filter
,
bias
,
output
);
}
conv2d_func
(
input
,
filter
,
bias
,
paddings
.
data
(),
output
);
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
5f5fa20b
...
...
@@ -113,6 +113,7 @@ void Conv1x1V3(const Tensor *input,
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
2
);
...
...
@@ -131,6 +132,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
extern
void
Conv2dOpenclK1x1S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
)
{
MACE_CHECK
(
input
->
dim
(
0
)
==
output
->
dim
(
0
));
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
5f5fa20b
...
...
@@ -6,65 +6,68 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
static
void
InnerConv2dK3x3S12
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
uint32_t
stride
,
Tensor
*
output
)
{
const
index_t
channels
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
3
);
static
void
Conv2d3x3S12
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
uint32_t
stride
,
const
int
*
padding
,
Tensor
*
output
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
channels
=
output
->
dim
(
3
);
const
index_t
input_channels
=
input
->
dim
(
3
);
MACE_CHECK
(
input
->
dim
(
0
)
==
output
->
dim
(
0
));
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
const
index_t
pixel_blocks
=
(
width
+
3
)
/
4
*
height
;
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DataTypeToOpenclCMDDataType
(
input
->
dtype
()));
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
auto
runtime
=
OpenCLRuntime
::
Get
();
auto
program
=
runtime
->
program
();
std
::
set
<
std
::
string
>
built_options
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DataTypeToCLType
(
input
->
dtype
()));
built_options
.
emplace
(
stride
==
1
?
"-DSTRIDE_1"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
auto
conv_kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
"conv_2d_3x3"
,
built_options
);
auto
conv_2d_kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
"conv_2d_3x3"
,
built_options
);
const
uint32_t
kwg_size
=
runtime
->
GetKernelMaxWorkGroupSize
(
conv_2d_kernel
);
uint32_t
idx
=
0
;
conv_
kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
input
->
buffer
())));
conv_
kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
filter
->
buffer
())));
conv_
2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input
->
buffer
())));
conv_
2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
filter
->
buffer
())));
if
(
bias
!=
nullptr
)
{
conv_
kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Buffer
*>
(
bias
->
buffer
())));
conv_
2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
bias
->
buffer
())));
}
conv_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
output
->
buffer
())));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channels
));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
3
)));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
height
));
conv_kernel
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
width
));
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
output
->
dim
(
0
)),
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
pixel_blocks
)};
const
uint32_t
lws
[
3
]
=
{
static_cast
<
uint32_t
>
(
1
),
static_cast
<
uint32_t
>
(
8
),
static_cast
<
uint32_t
>
(
128
)};
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
conv_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
conv_2d_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
output
->
buffer
())));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
1
)));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
2
)));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
0
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
1
]
/
2
);
auto
command_queue
=
runtime
->
command_queue
();
cl_int
error
;
error
=
command_queue
.
enqueueNDRangeKernel
(
conv_2d_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)),
cl
::
NDRange
(
4
,
15
,
8
),
NULL
,
OpenCLRuntime
::
Get
()
->
GetDefaultEvent
());
MACE_CHECK
(
error
==
CL_SUCCESS
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
,
error
);
}
void
Conv2dOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
InnerConv2dK3x3S12
(
input
,
filter
,
bias
,
1
,
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
)
{
Conv2d3x3S12
(
input
,
filter
,
bias
,
1
,
padding
,
output
);
};
void
Conv2dOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
)
{
InnerConv2dK3x3S12
(
input
,
filter
,
bias
,
2
,
output
);
const
Tensor
*
bias
,
const
int
*
padding
,
Tensor
*
output
)
{
};
}
// namespace kernels
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
5f5fa20b
...
...
@@ -17,12 +17,12 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
image_shape
[
1
]
=
shape
[
0
]
*
shape
[
1
];
}
// [H * W *
Ic
, (Oc + 3) / 4]
// [H * W *
4
, (Oc + 3) / 4]
void
CalFilterImageShape
(
const
std
::
vector
<
index_t
>
&
shape
,
/* HWIO*/
std
::
vector
<
size_t
>
&
image_shape
)
{
MACE_CHECK
(
shape
.
size
()
==
4
);
image_shape
.
resize
(
2
);
image_shape
[
0
]
=
shape
[
0
]
*
shape
[
1
]
*
shape
[
2
]
;
image_shape
[
0
]
=
shape
[
0
]
*
shape
[
1
]
*
RoundUp
<
index_t
>
(
shape
[
2
],
4
)
;
image_shape
[
1
]
=
RoundUpDiv4
(
shape
.
back
());
}
...
...
mace/ops/conv_2d_test.cc
浏览文件 @
5f5fa20b
...
...
@@ -78,19 +78,124 @@ void TestSimple3x3SAME() {
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
CPU
>
();
TestSimple3x3SAME
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
NEONSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
NEON
>
();
TestSimple3x3SAME
<
DeviceType
::
NEON
>
();
}
template
<
DeviceType
D
>
void
TestNHWCSimple3x3VALID
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
D
);
// Transfer output
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
}
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
template
<
DeviceType
D
>
void
TestNHWCSimple3x3SAME
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
// Transfer output
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
}
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
3
,
3
,
1
},
{
8.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
18.1
f
,
12.1
f
,
8.1
f
,
12.1
f
,
8.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUSimple
)
{
TestNHWCSimple3x3VALID
<
DeviceType
::
CPU
>
();
TestNHWCSimple3x3SAME
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
OPENCL
>
();
TestSimple3x3SAME
<
DeviceType
::
OPENCL
>
();
Test
NHWC
Simple3x3VALID
<
DeviceType
::
OPENCL
>
();
Test
NHWC
Simple3x3SAME
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
...
...
@@ -105,8 +210,6 @@ void TestSimple3x3WithoutBias() {
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Add args
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
3
},
...
...
@@ -125,16 +228,66 @@ void TestSimple3x3WithoutBias() {
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUWithoutBias
)
{
TestSimple3x3WithoutBias
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
NEONWithouBias
)
{
TestSimple3x3WithoutBias
<
DeviceType
::
NEON
>
();
}
template
<
DeviceType
D
>
void
TestNHWCSimple3x3WithoutBias
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
3
,
3
,
2
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
// Transfer output
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
}
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.0
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUWithoutBias
)
{
TestNHWCSimple3x3WithoutBias
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLWithoutBias
)
{
TestSimple3x3WithoutBias
<
DeviceType
::
OPENCL
>
();
Test
NHWC
Simple3x3WithoutBias
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
...
...
@@ -175,16 +328,72 @@ static void TestCombined3x3() {
}
TEST_F
(
Conv2dOpTest
,
CPUCombined
)
{
TestCombined3x3
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
NEONCombined
)
{
TestCombined3x3
<
DeviceType
::
NEON
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLCombined
)
{
TestCombined3x3
<
DeviceType
::
OPENCL
>
();
template
<
DeviceType
D
>
static
void
TestNHWCCombined3x3
()
{
// Construct graph
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
5
,
5
,
2
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
3
,
3
,
2
,
2
},
{
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
,
1.0
f
,
0.5
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
if
(
D
==
DeviceType
::
OPENCL
)
{
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
2
,
2
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT
);
}
else
{
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
2
,
2
})
.
AddIntArg
(
"padding"
,
Padding
::
SAME
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
}
// Check
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
3
,
3
,
2
},
{
8.1
f
,
4.2
f
,
12.1
f
,
6.2
f
,
8.1
f
,
4.2
f
,
12.1
f
,
6.2
f
,
18.1
f
,
9.2
f
,
12.1
f
,
6.2
f
,
8.1
f
,
4.2
f
,
12.1
f
,
6.2
f
,
8.1
f
,
4.2
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUCombined
)
{
TestNHWCCombined3x3
<
DeviceType
::
CPU
>
();
}
template
<
DeviceType
D
>
...
...
@@ -203,7 +412,7 @@ void TestConv1x1() {
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
5
,
3
,
10
},
"Input"
,
{
1
,
3
,
10
,
5
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
...
...
@@ -212,8 +421,8 @@ void TestConv1x1() {
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
2
,
5
,
1
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
2.0
f
,
2.0
f
,
2.0
f
,
2
.0
f
,
2.0
f
});
"Filter"
,
{
1
,
1
,
5
,
2
},
{
1.0
f
,
2.0
f
,
1.0
f
,
2.0
f
,
1.0
f
,
2.0
f
,
1.0
f
,
2.0
f
,
1
.0
f
,
2.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
// Run
...
...
@@ -221,13 +430,13 @@ void TestConv1x1() {
// Check
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
2
,
3
,
10
},
{
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
5.1
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
,
10.2
f
});
{
1
,
3
,
10
,
2
},
{
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
,
5.1
f
,
10.2
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
...
...
@@ -236,29 +445,28 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLConv1x1
)
{
TestConv1x1
<
DeviceType
::
OPENCL
>
();
}
//
TEST_F(Conv2dOpTest, OPENCLConv1x1) {
//
TestConv1x1<DeviceType::OPENCL>();
//
}
template
<
DeviceType
D
>
static
void
Test
AlignedConvNxNS12
(
)
{
static
void
Test
ComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
shape
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
srand
(
time
(
NULL
));
// generate random input
index_t
batch
=
3
;
index_t
input_channels
=
64
;
index_t
height
=
32
;
index_t
width
=
32
;
index_t
output_channels
=
128
;
index_t
batch
=
3
+
rand
()
%
10
;
index_t
height
=
shape
[
0
]
;
index_t
width
=
shape
[
1
]
;
index_t
input_channels
=
shape
[
2
]
+
rand
()
%
10
;
index_t
output_channels
=
shape
[
3
]
+
rand
()
%
10
;
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
type
)
...
...
@@ -266,92 +474,48 @@ static void TestAlignedConvNxNS12() {
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
input_channels
,
height
,
width
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
output_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
// Run on device
net
.
RunOp
(
D
);
// run on cpu
net
.
RunOp
();
// Check
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run
c
pu
net
.
RunOp
(
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
;
// run
on g
pu
BufferToImage
<
D
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
)
;
for
(
int
kernel_size
:
{
1
,
3
,
5
})
{
for
(
int
stride
:
{
1
,
2
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
VALID
);
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
SAME
);
}
}
}
TEST_F
(
Conv2dOpTest
,
NEONAlignedConvNxNS12
)
{
TestAlignedConvNxNS12
<
DeviceType
::
NEON
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedConvNxNS12
)
{
TestAlignedConvNxNS12
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
static
void
TestUnalignedConvNxNS12
()
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
srand
(
time
(
NULL
));
// generate random input
index_t
batch
=
3
+
rand
()
%
10
;
index_t
input_channels
=
3
+
rand
()
%
10
;
index_t
height
=
107
;
index_t
width
=
113
;
index_t
output_channels
=
3
+
rand
()
%
10
;
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
type
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
input_channels
,
height
,
width
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
// Run on device
net
.
RunOp
(
D
);
// Check
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run cpu
net
.
RunOp
();
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ImageToBuffer
<
D
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
for
(
int
kernel_size
:
{
1
,
3
,
5
})
{
for
(
int
stride
:
{
1
,
2
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
VALID
);
for
(
int
kernel_size
:
{
3
})
{
for
(
int
stride
:
{
1
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
SAME
);
}
}
}
TEST_F
(
Conv2dOpTest
,
NEONUna
lignedConvNxNS12
)
{
Test
UnalignedConvNxNS12
<
DeviceType
::
NEON
>
(
);
TEST_F
(
Conv2dOpTest
,
OPENCLA
lignedConvNxNS12
)
{
Test
ComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
,
64
,
128
}
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS12
)
{
Test
UnalignedConvNxNS12
<
DeviceType
::
OPENCL
>
(
);
Test
ComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
,
5
,
7
}
);
}
mace/ops/image_to_buffer.h
浏览文件 @
5f5fa20b
...
...
@@ -2,8 +2,8 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_OPS_
BUFFER_TO_IMAGE
_H_
#define MACE_OPS_
BUFFER_TO_IMAGE
_H_
#ifndef MACE_OPS_
IMAGE_TO_BUFFER
_H_
#define MACE_OPS_
IMAGE_TO_BUFFER
_H_
#include "mace/core/operator.h"
#include "mace/kernels/buffer_to_image.h"
...
...
@@ -35,4 +35,4 @@ class ImageToBufferOp: public Operator<D, T> {
};
}
// namespace mace
#endif // MACE_OPS_
BUFFER_TO_IMAGE
_H_
#endif // MACE_OPS_
IMAGE_TO_BUFFER
_H_
mace/ops/ops_test_util.h
浏览文件 @
5f5fa20b
...
...
@@ -170,6 +170,10 @@ class OpsTestNet {
return
ws_
.
GetTensor
(
output_name
);
}
Tensor
*
GetTensor
(
const
char
*
tensor_name
)
{
return
ws_
.
GetTensor
(
tensor_name
);
}
void
Sync
()
{
if
(
net_
&&
device_
==
DeviceType
::
OPENCL
)
{
OpenCLRuntime
::
Get
()
->
command_queue
().
finish
();
...
...
@@ -340,6 +344,39 @@ std::string ToString(const T &input) {
return
ss
.
str
();
}
template
<
DeviceType
D
>
void
BufferToImage
(
OpsTestNet
&
net
,
const
std
::
string
&
input_name
,
const
std
::
string
&
output_name
,
const
kernels
::
BufferType
type
)
{
OpDefBuilder
(
"BufferToImage"
,
"BufferToImageTest"
)
.
Input
(
input_name
)
.
Output
(
output_name
)
.
AddIntArg
(
"buffer_type"
,
type
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
Sync
();
}
template
<
DeviceType
D
>
void
ImageToBuffer
(
OpsTestNet
&
net
,
const
std
::
string
&
input_name
,
const
std
::
string
&
output_name
,
const
kernels
::
BufferType
type
)
{
OpDefBuilder
(
"ImageToBuffer"
,
"ImageToBufferTest"
)
.
Input
(
input_name
)
.
Output
(
output_name
)
.
AddIntArg
(
"buffer_type"
,
type
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
Sync
();
}
}
// namespace mace
...
...
mace/utils/tuner.h
浏览文件 @
5f5fa20b
...
...
@@ -148,6 +148,7 @@ class Tuner {
inline
RetType
Tune
(
const
std
::
function
<
std
::
vector
<
std
::
vector
<
param_type
>>
()
>
&
param_generator
,
const
std
::
function
<
RetType
(
const
std
::
vector
<
param_type
>
&
)
>
&
func
,
std
::
vector
<
param_type
>
&
opt_params
)
{
OpenCLRuntime
::
EnableProfiling
();
RetType
res
;
double
opt_time
=
std
::
numeric_limits
<
double
>::
max
();
auto
params
=
param_generator
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录