Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
336f35b4
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
336f35b4
编写于
12月 27, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Finish atrous convolution: faster than space_to_batch method.
上级
41a905c9
变更
17
隐藏空白更改
内联
并排
Showing
17 changed file
with
531 addition
and
139 deletion
+531
-139
mace/core/runtime/opencl/opencl_allocator.cc
mace/core/runtime/opencl/opencl_allocator.cc
+3
-1
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+25
-36
mace/examples/mace_run.cc
mace/examples/mace_run.cc
+3
-1
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+5
-3
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+5
-4
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+23
-17
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+2
-0
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+8
-4
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+4
-2
mace/kernels/opencl/fused_conv_2d_opencl.cc
mace/kernels/opencl/fused_conv_2d_opencl.cc
+20
-17
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+129
-16
mace/ops/fused_conv_2d_test.cc
mace/ops/fused_conv_2d_test.cc
+157
-0
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+16
-9
mace/python/tools/tf_converter.py
mace/python/tools/tf_converter.py
+2
-2
mace/python/tools/tf_converter_lib.py
mace/python/tools/tf_converter_lib.py
+123
-17
tools/validate.py
tools/validate.py
+4
-9
tools/validate_gcn.sh
tools/validate_gcn.sh
+2
-1
未找到文件。
mace/core/runtime/opencl/opencl_allocator.cc
浏览文件 @
336f35b4
...
...
@@ -60,7 +60,9 @@ void *OpenCLAllocator::NewImage(const std::vector<size_t> &image_shape,
img_format
,
image_shape
[
0
],
image_shape
[
1
],
0
,
nullptr
,
&
error
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
error
<<
" with image shape: ["
<<
image_shape
[
0
]
<<
", "
<<
image_shape
[
1
]
<<
"]"
;
return
cl_image
;
}
...
...
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
336f35b4
...
...
@@ -192,18 +192,9 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
*
program
=
cl
::
Program
(
this
->
context
(),
{
device
()},
{
binary
});
#else
std
::
string
source_filename
=
kernel_path_
+
program_file_name
;
std
::
string
binary_filename
=
kernel_path_
+
binary_file_name_prefix
+
".bin"
;
// Create program
bool
is_binary_filename_exist
=
std
::
ifstream
(
binary_filename
).
is_open
();
if
(
is_binary_filename_exist
)
{
VLOG
(
1
)
<<
"Create program with binary: "
<<
binary_filename
;
std
::
vector
<
unsigned
char
>
binary
;
MACE_CHECK
(
ReadFile
(
binary_filename
,
true
,
&
binary
));
*
program
=
cl
::
Program
(
this
->
context
(),
{
device
()},
{
binary
});
}
else
if
(
std
::
ifstream
(
source_filename
).
is_open
())
{
if
(
std
::
ifstream
(
source_filename
).
is_open
())
{
VLOG
(
1
)
<<
"Create program with source: "
<<
source_filename
;
std
::
vector
<
unsigned
char
>
kernel_source
;
MACE_CHECK
(
ReadFile
(
source_filename
,
false
,
&
kernel_source
));
...
...
@@ -214,8 +205,7 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
*
program
=
cl
::
Program
(
this
->
context
(),
sources
);
}
else
{
LOG
(
FATAL
)
<<
"Failed to open kernel file "
<<
binary_filename
<<
" or "
<<
source_filename
;
LOG
(
FATAL
)
<<
"Failed to open kernel file "
<<
source_filename
;
}
#endif
...
...
@@ -237,32 +227,31 @@ void OpenCLRuntime::BuildProgram(const std::string &program_file_name,
#ifndef MACE_EMBED_BINARY_PROGRAM
// Write binary if necessary
if
(
!
is_binary_filename_exist
)
{
size_t
device_list_size
=
1
;
std
::
unique_ptr
<
size_t
[]
>
program_binary_sizes
(
new
size_t
[
device_list_size
]);
cl_int
err
=
clGetProgramInfo
((
*
program
)(),
CL_PROGRAM_BINARY_SIZES
,
sizeof
(
size_t
)
*
device_list_size
,
program_binary_sizes
.
get
(),
nullptr
);
MACE_CHECK
(
err
==
CL_SUCCESS
)
<<
"Error code: "
<<
err
;
std
::
unique_ptr
<
std
::
unique_ptr
<
unsigned
char
[]
>
[]
>
program_binaries
(
new
std
::
unique_ptr
<
unsigned
char
[]
>
[
device_list_size
]);
for
(
cl_uint
i
=
0
;
i
<
device_list_size
;
++
i
)
{
program_binaries
[
i
]
=
std
::
unique_ptr
<
unsigned
char
[]
>
(
new
unsigned
char
[
program_binary_sizes
[
i
]]);
}
std
::
string
binary_filename
=
kernel_path_
+
binary_file_name_prefix
+
".bin"
;
size_t
device_list_size
=
1
;
std
::
unique_ptr
<
size_t
[]
>
program_binary_sizes
(
new
size_t
[
device_list_size
]);
cl_int
err
=
clGetProgramInfo
((
*
program
)(),
CL_PROGRAM_BINARY_SIZES
,
sizeof
(
size_t
)
*
device_list_size
,
program_binary_sizes
.
get
(),
nullptr
);
MACE_CHECK
(
err
==
CL_SUCCESS
)
<<
"Error code: "
<<
err
;
std
::
unique_ptr
<
std
::
unique_ptr
<
unsigned
char
[]
>
[]
>
program_binaries
(
new
std
::
unique_ptr
<
unsigned
char
[]
>
[
device_list_size
]);
for
(
cl_uint
i
=
0
;
i
<
device_list_size
;
++
i
)
{
program_binaries
[
i
]
=
std
::
unique_ptr
<
unsigned
char
[]
>
(
new
unsigned
char
[
program_binary_sizes
[
i
]]);
}
err
=
clGetProgramInfo
((
*
program
)(),
CL_PROGRAM_BINARIES
,
sizeof
(
unsigned
char
*
)
*
device_list_size
,
program_binaries
.
get
(),
nullptr
);
MACE_CHECK
(
err
==
CL_SUCCESS
)
<<
"Error code: "
<<
err
;
std
::
vector
<
unsigned
char
>
content
(
reinterpret_cast
<
unsigned
char
const
*>
(
program_binaries
[
0
].
get
()),
reinterpret_cast
<
unsigned
char
const
*>
(
program_binaries
[
0
].
get
())
+
program_binary_sizes
[
0
]);
err
=
clGetProgramInfo
((
*
program
)(),
CL_PROGRAM_BINARIES
,
sizeof
(
unsigned
char
*
)
*
device_list_size
,
program_binaries
.
get
(),
nullptr
);
MACE_CHECK
(
err
==
CL_SUCCESS
)
<<
"Error code: "
<<
err
;
std
::
vector
<
unsigned
char
>
content
(
reinterpret_cast
<
unsigned
char
const
*>
(
program_binaries
[
0
].
get
()),
reinterpret_cast
<
unsigned
char
const
*>
(
program_binaries
[
0
].
get
())
+
program_binary_sizes
[
0
]);
MACE_CHECK
(
WriteFile
(
binary_filename
,
true
,
content
));
}
MACE_CHECK
(
WriteFile
(
binary_filename
,
true
,
content
));
#endif
}
...
...
mace/examples/mace_run.cc
浏览文件 @
336f35b4
...
...
@@ -212,12 +212,14 @@ int main(int argc, char **argv) {
LOG
(
INFO
)
<<
"Avg duration: "
<<
(
t1
-
t0
)
/
round
<<
" us"
;
}
MACE_CHECK
(
engine
.
Run
(
input_data
.
get
(),
input_shape_vec
,
output_data
.
get
()));
if
(
output_data
!=
nullptr
)
{
ofstream
out_file
(
output_file
,
ios
::
binary
);
out_file
.
write
((
const
char
*
)
(
output_data
.
get
()),
output_size
*
sizeof
(
float
));
out_file
.
flush
();
out_file
.
close
();
LOG
(
INFO
)
<<
"Write output file done."
;
}
else
{
LOG
(
ERROR
)
<<
"output data is null"
;
}
}
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
336f35b4
...
...
@@ -14,7 +14,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private
const
int
filter_height,
__private
const
int
filter_width,
__private
const
int
padding_top,
__private
const
int
padding_left
)
{
__private
const
int
padding_left,
__private
const
int
dilation_h,
__private
const
int
dilation_w
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
...
...
@@ -57,7 +59,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const
int
in_idx
=
mul24
(
in_ch_blk,
in_width
)
;
int
filter_x_part0
=
in_ch_blk
<<
2
;
for
(
short
hb_idx
=
0
; hb_idx < filter_height; ++hb_idx) {
int
in_hb_value
=
height_idx
+
hb_idx
;
int
in_hb_value
=
height_idx
+
mul24
(
hb_idx,
dilation_h
)
;
in_hb_value
=
select
(
in_hb_value
+
batch_idx,
-1
,
(
in_hb_value
<
0
|
| in_hb_value >= in_height));
...
...
@@ -66,7 +68,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
for (short width_idx = 0; width_idx < filter_width; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i +
width_idx;
\
in_width_value = in_width##i +
mul24(width_idx, dilation_w);
\
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 |
|
in_width_value
>=
in_width
))
; \
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
336f35b4
...
...
@@ -12,7 +12,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private
const
int
out_height,
__private
const
int
out_width,
__private
const
int
padding_top,
__private
const
int
padding_left
)
{
__private
const
int
padding_left,
__private
const
int
dilation_h,
__private
const
int
dilation_w
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
...
...
@@ -55,12 +57,11 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
DATA_TYPE4
in0,
in1,
in2,
in3,
in4
;
DATA_TYPE4
weights0,
weights1,
weights2,
weights3
;
int
hb_idx,
width_idx,
in_width_idx
;
for
(
short
in_ch_blk
=
0
; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const
int
in_idx
=
mul24
(
in_ch_blk,
in_width
)
;
int
filter_x_part0
=
in_ch_blk
<<
2
;
for
(
short
hb_idx
=
0
; hb_idx < 3; ++hb_idx) {
int
in_hb_value
=
height_idx
+
hb_idx
;
int
in_hb_value
=
height_idx
+
mul24
(
hb_idx,
dilation_h
)
;
in_hb_value
=
select
(
in_hb_value
+
batch_idx,
-1
,
(
in_hb_value
<
0
|
| in_hb_value >= in_height));
...
...
@@ -68,7 +69,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
for (short width_idx = 0; width_idx < 3; ++width_idx) {
int in_width_value;
#define READ_INPUT(i) \
in_width_value = in_width##i +
width_idx;
\
in_width_value = in_width##i +
mul24(width_idx, dilation_w);
\
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 |
|
in_width_value
>=
in_width
))
; \
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
336f35b4
...
...
@@ -10,29 +10,33 @@ namespace kernels {
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpenclK1x1S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpencl
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
uint32_t
stride
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
outpu
t
,
StatsFuture
*
future
);
const
int
*
dilations
,
const
DataType
d
t
,
Tensor
*
output
,
StatsFuture
*
future
);
template
<
typename
T
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
...
...
@@ -42,8 +46,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture
*
future
)
{
typedef
void
(
*
Conv2dOpenclFunction
)(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
][
2
]
=
{
...
...
@@ -55,12 +59,14 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t
kernel_h
=
filter
->
dim
(
0
);
index_t
kernel_w
=
filter
->
dim
(
1
);
if
(
!
input
->
is_image
()
||
strides_
[
0
]
!=
strides_
[
1
]
||
strides_
[
0
]
>
2
||
dilations_
[
0
]
!=
1
||
dilations_
[
1
]
!=
1
)
{
if
(
!
input
->
is_image
()
||
strides_
[
0
]
!=
strides_
[
1
]
||
strides_
[
0
]
>
2
||
(
dilations_
[
0
]
>
1
&&
(
strides_
[
0
]
>
1
||
kernel_h
==
1
))
)
{
LOG
(
WARNING
)
<<
"OpenCL conv2d kernel with "
<<
"filter"
<<
kernel_h
<<
"x"
<<
kernel_w
<<
","
<<
" stride "
<<
strides_
[
0
]
<<
"x"
<<
strides_
[
1
]
<<
" is not implemented yet, using slow version"
;
<<
",dilations "
<<
dilations_
[
0
]
<<
"x"
<<
dilations_
[
1
]
<<
" and input image: "
<<
input
->
is_image
()
<<
" is not implemented yet."
;
MACE_NOT_IMPLEMENTED
;
}
...
...
@@ -77,11 +83,11 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
kernel_h
==
kernel_w
&&
kernel_h
<=
5
&&
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
]
!=
nullptr
)
{
auto
conv2d_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
conv2d_func
(
input
,
filter
,
bias
,
false
,
paddings
.
data
(),
conv2d_func
(
input
,
filter
,
bias
,
false
,
paddings
.
data
(),
dilations_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
else
{
Conv2dOpencl
(
input
,
filter
,
bias
,
false
,
strides_
[
0
],
paddings
.
data
(),
DataTypeToEnum
<
T
>::
value
,
paddings
.
data
(),
dilations_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
336f35b4
...
...
@@ -129,6 +129,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -140,6 +141,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
336f35b4
...
...
@@ -15,8 +15,8 @@ namespace kernels {
static
void
Conv2d3x3S12
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
uint32_t
stride
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
outpu
t
,
StatsFuture
*
future
)
{
const
int
*
dilations
,
const
DataType
d
t
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -53,6 +53,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
0
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
1
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
dilations
[
0
]);
conv_2d_kernel
.
setArg
(
idx
++
,
dilations
[
1
]);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
...
...
@@ -121,10 +123,11 @@ void Conv2dOpenclK3x3S1(const Tensor *input,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
)
{
Conv2d3x3S12
(
input
,
filter
,
bias
,
fused_relu
,
1
,
padding
,
dt
,
output
,
future
);
Conv2d3x3S12
(
input
,
filter
,
bias
,
fused_relu
,
1
,
padding
,
d
ilations
,
d
t
,
output
,
future
);
};
void
Conv2dOpenclK3x3S2
(
const
Tensor
*
input
,
...
...
@@ -132,10 +135,11 @@ void Conv2dOpenclK3x3S2(const Tensor *input,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
)
{
Conv2d3x3S12
(
input
,
filter
,
bias
,
fused_relu
,
2
,
padding
,
dt
,
output
,
future
);
Conv2d3x3S12
(
input
,
filter
,
bias
,
fused_relu
,
2
,
padding
,
d
ilations
,
d
t
,
output
,
future
);
};
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
336f35b4
...
...
@@ -15,8 +15,8 @@ namespace kernels {
void
Conv2dOpencl
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
uint32_t
stride
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
outpu
t
,
StatsFuture
*
future
)
{
const
int
*
dilations
,
const
DataType
d
t
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -55,6 +55,8 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
conv_2d_kernel
.
setArg
(
idx
++
,
static_cast
<
int
>
(
filter
->
dim
(
1
)));
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
0
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
padding
[
1
]
/
2
);
conv_2d_kernel
.
setArg
(
idx
++
,
dilations
[
0
]);
conv_2d_kernel
.
setArg
(
idx
++
,
dilations
[
1
]);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
...
...
mace/kernels/opencl/fused_conv_2d_opencl.cc
浏览文件 @
336f35b4
...
...
@@ -10,33 +10,33 @@ namespace kernels {
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpenclK1x1S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
extern
void
Conv2dOpencl
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
uint32_t
stride
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
outpu
t
,
StatsFuture
*
future
);
const
int
*
dilations
,
const
DataType
d
t
,
Tensor
*
output
,
StatsFuture
*
future
);
template
<
typename
T
>
void
FusedConv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
...
...
@@ -46,8 +46,9 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
StatsFuture
*
future
)
{
typedef
void
(
*
Conv2dOpenclFunction
)(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
bool
fused_relu
,
const
int
*
padding
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
*
padding
,
const
int
*
dilations
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
][
2
]
=
{
{
Conv2dOpenclK1x1S1
,
Conv2dOpenclK1x1S2
},
...
...
@@ -57,12 +58,14 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{
nullptr
,
nullptr
}};
index_t
kernel_h
=
filter
->
dim
(
0
);
index_t
kernel_w
=
filter
->
dim
(
1
);
if
(
!
input
->
is_image
()
||
strides_
[
0
]
!=
strides_
[
1
]
||
strides_
[
0
]
>
2
||
dilations_
[
0
]
!=
1
||
dilations_
[
1
]
!=
1
)
{
if
(
!
input
->
is_image
()
||
strides_
[
0
]
!=
strides_
[
1
]
||
strides_
[
0
]
>
2
||
(
dilations_
[
0
]
>
1
&&
(
strides_
[
0
]
>
1
||
kernel_h
==
1
))
)
{
LOG
(
WARNING
)
<<
"OpenCL conv2d kernel with "
<<
"filter"
<<
kernel_h
<<
"x"
<<
kernel_w
<<
","
<<
" stride "
<<
strides_
[
0
]
<<
"x"
<<
strides_
[
1
]
<<
" is not implemented yet, using slow version"
;
<<
",dilations "
<<
dilations_
[
0
]
<<
"x"
<<
dilations_
[
1
]
<<
" and input image: "
<<
input
->
is_image
()
<<
" is not implemented yet."
;
MACE_NOT_IMPLEMENTED
;
}
...
...
@@ -79,11 +82,11 @@ void FusedConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
kernel_h
==
kernel_w
&&
kernel_h
<=
5
&&
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
]
!=
nullptr
)
{
auto
conv2d_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
conv2d_func
(
input
,
filter
,
bias
,
true
,
paddings
.
data
(),
conv2d_func
(
input
,
filter
,
bias
,
true
,
paddings
.
data
(),
dilations_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
else
{
Conv2dOpencl
(
input
,
filter
,
bias
,
true
,
strides_
[
0
],
paddings
.
data
(),
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
dilations_
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
}
}
...
...
mace/ops/conv_2d_test.cc
浏览文件 @
336f35b4
...
...
@@ -2,6 +2,7 @@
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <fstream>
#include "mace/ops/conv_2d.h"
#include "mace/ops/ops_test_util.h"
...
...
@@ -564,19 +565,20 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
template
<
DeviceType
D
>
static
void
TestHalfComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
)
{
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
dilations
)
{
testing
::
internal
::
LogToStderr
();
srand
(
time
(
NULL
));
auto
func
=
[
&
](
int
stride_h
,
int
stride_w
,
Padding
padding
)
{
// generate random input
index_t
batch
=
3
+
(
rand
()
%
10
)
;
index_t
batch
=
3
;
index_t
height
=
input_shape
[
0
];
index_t
width
=
input_shape
[
1
];
index_t
kernel_h
=
filter_shape
[
0
];
index_t
kernel_w
=
filter_shape
[
1
];
index_t
input_channels
=
filter_shape
[
2
]
+
(
rand
()
%
10
)
;
index_t
output_channels
=
filter_shape
[
3
]
+
(
rand
()
%
10
)
;
index_t
input_channels
=
filter_shape
[
2
];
index_t
output_channels
=
filter_shape
[
3
];
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
...
@@ -586,7 +588,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
padding
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
dilations
[
0
],
dilations
[
1
]
})
.
Finalize
(
net
.
NewOperatorDef
());
std
::
vector
<
float
>
float_input_data
;
...
...
@@ -619,7 +621,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
padding
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntsArg
(
"dilations"
,
{
dilations
[
0
],
dilations
[
1
]
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataType
::
DT_HALF
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run on device
...
...
@@ -630,43 +632,154 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.5
);
};
for
(
int
stride
:
{
1
,
2
})
{
func
(
stride
,
stride
,
VALID
);
func
(
stride
,
stride
,
SAME
);
func
(
1
,
1
,
VALID
);
func
(
1
,
1
,
SAME
);
if
(
dilations
[
0
]
==
1
)
{
func
(
2
,
2
,
VALID
);
func
(
2
,
2
,
SAME
);
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
1
,
32
,
64
});
{
1
,
1
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
3
,
3
,
32
,
64
});
{
3
,
3
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv15x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
15
,
1
,
256
,
2
});
{
15
,
1
,
256
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x15S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
1
,
15
,
256
,
2
});
{
1
,
15
,
256
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv7x75S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
32
,
32
},
{
7
,
7
,
3
,
64
});
{
7
,
7
,
3
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
1
,
1
,
5
,
7
});
{
1
,
1
,
5
,
7
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
107
,
113
},
{
3
,
3
,
5
,
7
});
{
3
,
3
,
5
,
7
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv5x5Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
5
,
5
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
64
,
64
},
{
7
,
7
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation4
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
OPENCL
>
({
63
,
67
},
{
7
,
7
,
16
,
16
},
{
4
,
4
});
}
template
<
DeviceType
D
,
typename
T
>
static
void
TestDilationConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation_rate
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
srand
(
time
(
NULL
));
// generate random input
index_t
batch
=
1
;
index_t
height
=
shape
[
0
];
index_t
width
=
shape
[
1
];
index_t
input_channels
=
shape
[
2
];
index_t
output_channels
=
shape
[
3
];
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
type
)
.
AddIntsArg
(
"dilations"
,
{
dilation_rate
,
dilation_rate
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
output_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
net
.
RunOp
();
// Check
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
type
)
.
AddIntsArg
(
"dilations"
,
{
dilation_rate
,
dilation_rate
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run on device
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
for
(
int
kernel_size
:
{
3
})
{
for
(
int
stride
:
{
1
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
VALID
);
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
SAME
);
}
}
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedDilation2
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
32
,
32
,
32
,
64
},
2
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLAligned2Dilation4
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
4
);
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedDilation4
)
{
TestDilationConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
4
);
}
mace/ops/fused_conv_2d_test.cc
浏览文件 @
336f35b4
...
...
@@ -486,3 +486,160 @@ TEST_F(FusedConv2dOpTest, OPENCL15X1ConvNxNS12) {
{
15
,
1
,
32
,
64
});
}
template
<
DeviceType
D
,
typename
T
>
static
void
TestAtrousConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
srand
(
time
(
NULL
));
// generate random input
index_t
batch
=
1
;
index_t
height
=
shape
[
0
];
index_t
width
=
shape
[
1
];
index_t
input_channels
=
shape
[
2
];
index_t
output_channels
=
shape
[
3
];
// Construct graph
OpsTestNet
net
;
OpDefBuilder
(
"FusedConv2D"
,
"FusedConv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
type
)
.
AddIntsArg
(
"dilations"
,
{
dilation
,
dilation
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
kernel_h
,
kernel_w
,
input_channels
,
output_channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
// run on cpu
net
.
RunOp
();
// Check
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
// run on gpu
BufferToImage
<
D
,
T
>
(
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT
);
BufferToImage
<
D
,
T
>
(
net
,
"Filter"
,
"FilterImage"
,
kernels
::
BufferType
::
FILTER
);
BufferToImage
<
D
,
T
>
(
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FusedConv2D"
,
"FusedConv2dTest"
)
.
Input
(
"InputImage"
)
.
Input
(
"FilterImage"
)
.
Input
(
"BiasImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"strides"
,
{
stride_h
,
stride_w
})
.
AddIntArg
(
"padding"
,
type
)
.
AddIntsArg
(
"dilations"
,
{
dilation
,
dilation
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run on device
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
T
>
(
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
0.001
);
};
for
(
int
kernel_size
:
{
3
})
{
for
(
int
stride
:
{
1
})
{
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
VALID
);
func
(
kernel_size
,
kernel_size
,
stride
,
stride
,
SAME
);
}
}
}
TEST_F
(
FusedConv2dOpTest
,
OPENCLalignedAtrousConvNxN2
)
{
TestAtrousConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
2
);
}
TEST_F
(
FusedConv2dOpTest
,
OPENCLalignedAtrousConvNxN4
)
{
TestAtrousConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
128
,
128
,
16
,
16
},
4
);
}
TEST_F
(
FusedConv2dOpTest
,
OPENCLUnalignedAtrousConvNxN
)
{
TestAtrousConvNxN
<
DeviceType
::
OPENCL
,
float
>
({
107
,
113
,
5
,
7
},
2
);
}
// Runs FusedConv2D with the half-precision GPU path and compares it
// against the float CPU reference within a loose tolerance (0.7,
// required by fp16 accumulation error).
//
// image_shape:  {height, width} of the input feature map (batch is 1).
// filter_shape: {kernel_h, kernel_w, input_channels, output_channels}.
// dilations:    {dilation_h, dilation_w} for the atrous convolution.
//
// Bug fix: the dilations argument was previously ignored — both op
// definitions hard-coded AddIntsArg("dilations", {1, 1}), so the
// "AtrousConv" tests silently ran a plain convolution. The caller's
// dilation values are now forwarded to both the CPU and GPU op defs.
template<DeviceType D>
static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
                                      const std::vector<index_t> &filter_shape,
                                      const std::vector<int> &dilations) {
  testing::internal::LogToStderr();
  auto func = [&](int stride_h, int stride_w, Padding type) {
    srand(time(NULL));

    // generate random input
    index_t batch = 1;
    index_t height = image_shape[0];
    index_t width = image_shape[1];
    index_t input_channels = filter_shape[2];
    index_t output_channels = filter_shape[3];
    index_t kernel_h = filter_shape[0];
    index_t kernel_w = filter_shape[1];

    // Construct graph (CPU reference run, float32).
    OpsTestNet net;
    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
        .Input("Input")
        .Input("Filter")
        .Input("Bias")
        .Output("Output")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {dilations[0], dilations[1]})
        .Finalize(net.NewOperatorDef());

    // Add input data
    net.AddRandomInput<D, float>("Input",
                                 {batch, height, width, input_channels});
    net.AddRandomInput<D, float>(
        "Filter", {kernel_h, kernel_w, input_channels, output_channels});
    net.AddRandomInput<D, float>("Bias", {output_channels});

    // run on cpu
    net.RunOp();

    // Keep a copy of the CPU result before the GPU run reuses the net.
    Tensor expected;
    expected.Copy(*net.GetOutput("Output"));

    // run on gpu: convert inputs to fp16 OpenCL images.
    BufferToImage<D, half>(net, "Input", "InputImage",
                           kernels::BufferType::IN_OUT);
    BufferToImage<D, half>(net, "Filter", "FilterImage",
                           kernels::BufferType::FILTER);
    BufferToImage<D, half>(net, "Bias", "BiasImage",
                           kernels::BufferType::ARGUMENT);

    OpDefBuilder("FusedConv2D", "FusedConv2dTest")
        .Input("InputImage")
        .Input("FilterImage")
        .Input("BiasImage")
        .Output("OutputImage")
        .AddIntsArg("strides", {stride_h, stride_w})
        .AddIntArg("padding", type)
        .AddIntsArg("dilations", {dilations[0], dilations[1]})
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
        .Finalize(net.NewOperatorDef());

    // Run on device
    net.RunOp(D);

    ImageToBuffer<D, float>(net, "OutputImage", "OPENCLOutput",
                            kernels::BufferType::IN_OUT);
    // Loose tolerance: half-precision accumulation over large kernels.
    ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 0.7);
  };

  func(1, 1, VALID);
  func(1, 1, SAME);
}
// 7x7 kernel, 3 -> 16 channels, 32x32 image, dilations {2, 2};
// half-precision GPU path checked against the float CPU reference.
TEST_F(FusedConv2dOpTest, OPENCL7X7AtrousConvD2) {
  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({32, 32}, {7, 7, 3, 16},
                                                {2, 2});
}
// 15x15 kernel, 16 -> 16 channels, 63x71 image, dilations {4, 4};
// half-precision GPU path checked against the float CPU reference.
//
// Bug fix: the test is named ...D4 but passed dilations {2, 2}
// (copy-paste from the D2 test above); it now passes {4, 4}.
TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
  TestGeneralHalfAtrousConv<DeviceType::OPENCL>({63, 71}, {15, 15, 16, 16},
                                                {4, 4});
}
mace/ops/ops_test_util.h
浏览文件 @
336f35b4
...
...
@@ -322,18 +322,25 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
Tensor
::
MappingGuard
y_mapper
(
&
y
);
auto
a
=
x
.
data
<
EXP_TYPE
>
();
auto
b
=
y
.
data
<
RES_TYPE
>
();
for
(
int
n
=
0
;
n
<
x
.
dim
(
0
);
++
n
)
{
for
(
int
h
=
0
;
h
<
x
.
dim
(
1
);
++
h
)
{
for
(
int
w
=
0
;
w
<
x
.
dim
(
2
);
++
w
)
{
for
(
int
c
=
0
;
c
<
x
.
dim
(
3
);
++
c
)
{
EXPECT_NEAR
(
*
a
,
*
b
,
abs_err
)
<<
"with index = ["
<<
n
<<
", "
<<
h
<<
", "
<<
w
<<
", "
<<
c
<<
"]"
;
a
++
;
b
++
;
if
(
x
.
dim_size
()
==
4
)
{
for
(
int
n
=
0
;
n
<
x
.
dim
(
0
);
++
n
)
{
for
(
int
h
=
0
;
h
<
x
.
dim
(
1
);
++
h
)
{
for
(
int
w
=
0
;
w
<
x
.
dim
(
2
);
++
w
)
{
for
(
int
c
=
0
;
c
<
x
.
dim
(
3
);
++
c
)
{
EXPECT_NEAR
(
*
a
,
*
b
,
abs_err
)
<<
"with index = ["
<<
n
<<
", "
<<
h
<<
", "
<<
w
<<
", "
<<
c
<<
"]"
;
a
++
;
b
++
;
}
}
}
}
}
else
{
for
(
int
i
=
0
;
i
<
x
.
size
();
++
i
)
{
EXPECT_NEAR
(
a
[
i
],
b
[
i
],
abs_err
)
<<
"a = "
<<
a
<<
" b = "
<<
b
<<
" index = "
<<
i
;
}
}
}
...
...
mace/python/tools/tf_converter.py
浏览文件 @
336f35b4
...
...
@@ -21,7 +21,6 @@ def main(unused_args):
data
=
f
.
read
()
input_graph_def
.
ParseFromString
(
data
)
print
'done'
if
FLAGS
.
runtime
==
'dsp'
:
output_graph_def
=
tf_dsp_converter_lib
.
convert_to_mace_pb
(
input_graph_def
,
FLAGS
.
input_node
,
FLAGS
.
output_node
,
FLAGS
.
prequantize
)
...
...
@@ -38,6 +37,7 @@ def main(unused_args):
with
gfile
.
GFile
(
FLAGS
.
output
+
'_txt'
,
"wb"
)
as
f
:
# output_graph_def.ClearField('tensors')
f
.
write
(
str
(
output_graph_def
))
print
(
"Model conversion is completed."
)
def
parse_args
():
...
...
@@ -82,7 +82,7 @@ def parse_args():
parser
.
add_argument
(
"--output_type"
,
type
=
str
,
default
=
"
source
"
,
default
=
"
pb
"
,
help
=
"output type: source/pb"
)
parser
.
add_argument
(
"--template"
,
...
...
mace/python/tools/tf_converter_lib.py
浏览文件 @
336f35b4
...
...
@@ -44,6 +44,7 @@ class TFConverter(object):
self
.
device
=
device
self
.
tf_graph
=
{}
self
.
resolved_ops
=
{}
self
.
unused_tensor
=
set
()
for
op
in
tf_ops
:
self
.
resolved_ops
[
op
.
name
]
=
0
...
...
@@ -72,6 +73,23 @@ class TFConverter(object):
arg
.
i
=
self
.
dt
return
output_name
def add_image_to_buffer(self, input_name, input_type):
    """Append an ImageToBuffer op converting the OpenCL image tensor
    `input_name` back into a plain buffer.

    Args:
      input_name: tensor name; assumed to end with a 2-char port suffix
        like ':0' -- TODO confirm, the '_i2b' marker is spliced in
        before those last two characters.
      input_type: key into buffer_type_map (e.g. 'FILTER', 'ARGUMENT'),
        telling the kernel which image layout to decode.

    Returns:
      The name of the new buffer-typed output tensor.
    """
    output_name = input_name[:-2] + "_i2b" + input_name[-2:]
    op_def = self.net_def.op.add()
    # Op name is the output name with the port suffix stripped.
    op_def.name = output_name[:-2]
    op_def.type = 'ImageToBuffer'
    op_def.input.extend([input_name])
    op_def.output.extend([output_name])
    arg = op_def.arg.add()
    arg.name = 'buffer_type'
    arg.i = buffer_type_map[input_type]
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt
    return output_name
def
add_input_transform
(
self
,
name
):
new_input_name
=
MACE_INPUT_NODE_NAME
+
":0"
op_def
=
self
.
net_def
.
op
.
add
()
...
...
@@ -111,22 +129,23 @@ class TFConverter(object):
op
.
output_shape
.
extend
(
output_shapes
)
def convert_tensor(self, op):
    """Serialize a Const op's value into the NetDef tensor list.

    Tensors recorded in self.unused_tensor (arguments consumed at
    conversion time, e.g. SpaceToBatchND block-shape/paddings) are
    skipped entirely. Only float32 and int32 tensors are supported;
    any other dtype raises. The op is marked resolved either way.

    Note: the scraped span contained both the pre-change body
    (unconditional serialization) and the post-change body fused
    together; this is the deduplicated final version.
    """
    if op.outputs[0].name not in self.unused_tensor:
        tensor = self.net_def.tensors.add()
        tf_tensor = op.outputs[0].eval()
        tensor.name = op.outputs[0].name

        shape = list(tf_tensor.shape)
        tensor.dims.extend(shape)

        tf_dt = op.get_attr('dtype')
        if tf_dt == tf.float32:
            tensor.data_type = mace_pb2.DT_FLOAT
            tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
        elif tf_dt == tf.int32:
            tensor.data_type = mace_pb2.DT_INT32
            tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
        else:
            raise Exception("Not supported tensor type: " + tf_dt.name)
    self.resolved_ops[op.name] = 1
def
convert_conv2d
(
self
,
op
):
...
...
@@ -253,6 +272,7 @@ class TFConverter(object):
data_format_arg
=
op_def
.
arg
.
add
()
data_format_arg
.
name
=
'data_format'
data_format_arg
.
s
=
'NHWC'
self
.
unused_tensor
.
add
(
get_input_tensor
(
op
,
1
).
name
)
self
.
net_def
.
op
.
extend
([
op_def
])
for
i
in
range
(
0
,
7
):
...
...
@@ -326,6 +346,7 @@ class TFConverter(object):
axis_arg
.
i
=
get_input_tensor
(
op
,
2
).
eval
().
astype
(
np
.
int32
)
self
.
add_output_shape
(
op
.
outputs
,
op_def
)
self
.
resolved_ops
[
op
.
name
]
=
1
self
.
unused_tensor
.
add
(
get_input_tensor
(
op
,
2
).
name
)
def
convert_resize_bilinear
(
self
,
op
):
op_def
=
self
.
net_def
.
op
.
add
()
...
...
@@ -344,6 +365,7 @@ class TFConverter(object):
size_arg
.
i
=
op
.
get_attr
(
'align_corners'
)
self
.
add_output_shape
(
op
.
outputs
,
op_def
)
self
.
resolved_ops
[
op
.
name
]
=
1
self
.
unused_tensor
.
add
(
get_input_tensor
(
op
,
1
).
name
)
def
convert_bias_add
(
self
,
op
):
op_def
=
mace_pb2
.
OperatorDef
()
...
...
@@ -383,6 +405,79 @@ class TFConverter(object):
size_arg
.
ints
.
extend
(
get_input_tensor
(
op
,
2
).
eval
().
astype
(
np
.
int32
).
flat
)
self
.
add_output_shape
(
op
.
outputs
,
op_def
)
self
.
resolved_ops
[
op
.
name
]
=
1
self
.
unused_tensor
.
add
(
get_input_tensor
(
op
,
1
).
name
)
self
.
unused_tensor
.
add
(
get_input_tensor
(
op
,
2
).
name
)
def is_atrous_conv2d(self, op):
    """Return True iff `op` is the SpaceToBatchND head of an atrous
    convolution pattern: its sole consumer in tf_graph is a Conv2D."""
    if op.type != 'SpaceToBatchND':
        return False
    consumers = self.tf_graph[op.name]
    return len(consumers) == 1 and consumers[0].type == 'Conv2D'
def convert_atrous_conv2d(self, op):
    """Fuse a SpaceToBatchND -> Conv2D -> [BiasAdd] -> BatchToSpaceND
    -> [Relu] subgraph into a single (Fused)Conv2D op with a
    'dilations' argument, avoiding the space-to-batch round trip.

    `op` is the SpaceToBatchND node (is_atrous_conv2d must be True).
    Raises if no BatchToSpaceND follows the conv/bias chain.
    """
    op_def = mace_pb2.OperatorDef()
    arg = op_def.arg.add()
    arg.name = 'T'
    arg.i = self.dt
    # The single consumer of the SpaceToBatchND node is the Conv2D.
    conv_op = self.tf_graph[op.name][0]
    op_def.name = conv_op.name
    op_def.type = conv_op.type
    # Data input comes from SpaceToBatchND's input (bypassing it);
    # the filter is the conv's second input. On GPU the filter is
    # first converted to an OpenCL image.
    if self.device == 'gpu':
        op_def.input.extend([op.inputs[0].name])
        output_name = self.add_buffer_to_image(conv_op.inputs[1].name,
                                               "FILTER")
        op_def.input.extend([output_name])
    else:
        op_def.input.extend([op.inputs[0].name])
        op_def.input.extend([conv_op.inputs[1].name])

    # SpaceToBatchND's block-shape tensor (input 1) becomes the
    # dilation rates of the fused conv.
    dilation_arg = op_def.arg.add()
    dilation_arg.name = 'dilations'
    dilation_arg.ints.extend(
        get_input_tensor(op, 1).eval().astype(np.int32).flat)
    # Non-zero explicit paddings (input 2) imply SAME padding;
    # otherwise VALID.
    padding_arg = op_def.arg.add()
    padding_arg.name = 'padding'
    padding_values = get_input_tensor(op, 2).eval().astype(np.int32).flat
    if len(padding_values) > 0 and padding_values[0] > 0:
        padding_arg.i = padding_mode['SAME']
    else:
        padding_arg.i = padding_mode['VALID']
    # Block-shape and paddings are consumed here; don't serialize them.
    self.unused_tensor.add(get_input_tensor(op, 1).name)
    self.unused_tensor.add(get_input_tensor(op, 2).name)

    # Atrous convolution always uses unit strides.
    strides_arg = op_def.arg.add()
    strides_arg.name = 'strides'
    strides_arg.ints.extend([1, 1])
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
    data_format_arg.s = 'NHWC'
    final_op = conv_op
    self.resolved_ops[op.name] = 1
    self.resolved_ops[conv_op.name] = 1

    # Optionally fold a trailing BiasAdd's bias as a third input.
    if len(self.tf_graph[final_op.name]) == 1 and \
            self.tf_graph[final_op.name][0].type == 'BiasAdd':
        bias_add_op = self.tf_graph[final_op.name][0]
        if self.device == 'gpu':
            output_name = self.add_buffer_to_image(
                bias_add_op.inputs[1].name, "ARGUMENT")
            op_def.input.extend([output_name])
        else:
            op_def.input.extend([bias_add_op.inputs[1].name])
        final_op = bias_add_op
        self.resolved_ops[bias_add_op.name] = 1

    # The BatchToSpaceND closing the pattern is mandatory; it is
    # absorbed into the fused op.
    if len(self.tf_graph[final_op.name]) == 1 \
            and self.tf_graph[final_op.name][0].type == 'BatchToSpaceND':
        final_op = self.tf_graph[final_op.name][0]
        self.resolved_ops[final_op.name] = 1
    else:
        raise Exception('Convert atrous conv error: no BatchToSpaceND op')

    # Optionally fold a trailing Relu by switching to FusedConv2D.
    if len(self.tf_graph[final_op.name]) == 1 \
            and self.tf_graph[final_op.name][0].type == 'Relu':
        relu_op = self.tf_graph[final_op.name][0]
        op_def.type = "FusedConv2D"
        final_op = relu_op
        self.resolved_ops[relu_op.name] = 1

    # The fused op takes over the outputs of the last folded node.
    op_def.output.extend([output.name for output in final_op.outputs])
    self.add_output_shape(final_op.outputs, op_def)
    self.net_def.op.extend([op_def])
def
convert_normal_op
(
self
,
op
):
op_def
=
self
.
net_def
.
op
.
add
()
...
...
@@ -407,7 +502,9 @@ class TFConverter(object):
self
.
resolved_ops
[
op
.
name
]
=
1
pass
elif
op
.
type
==
'Const'
:
self
.
convert_tensor
(
op
)
pass
elif
self
.
is_atrous_conv2d
(
op
):
self
.
convert_atrous_conv2d
(
op
)
elif
op
.
type
==
'Conv2D'
or
op
.
type
==
'DepthwiseConv2dNative'
:
self
.
convert_conv2d
(
op
)
elif
op
.
type
==
'FusedBatchNorm'
:
...
...
@@ -435,6 +532,15 @@ class TFConverter(object):
else
:
raise
Exception
(
'Unknown Op: %s, type: %s'
%
(
op
.
name
,
op
.
type
))
for
op
in
self
.
tf_ops
:
if
self
.
resolved_ops
[
op
.
name
]
==
1
:
continue
elif
op
.
type
==
'Const'
:
self
.
convert_tensor
(
op
)
else
:
raise
Exception
(
'Unknown Op: %s, type: %s'
%
(
op
.
name
,
op
.
type
))
if
self
.
device
==
'gpu'
:
self
.
add_output_transform
(
output_node
)
...
...
tools/validate.py
浏览文件 @
336f35b4
...
...
@@ -10,8 +10,8 @@ from tensorflow import gfile
# Validation Flow:
# 1. Generate input data
# python validate_icnet.py --generate_data 1
\
#
--random_seed 1
# python validate_icnet.py --generate_data 1
#
# 2. Use mace_run to run icnet on phone.
# 3. adb pull the result.
# 4. Compare output data of mace and tf
...
...
@@ -20,7 +20,7 @@ from tensorflow import gfile
# --mace_out_file icnet.out
def
generate_data
(
shape
):
np
.
random
.
seed
(
FLAGS
.
random_seed
)
np
.
random
.
seed
()
data
=
np
.
random
.
random
(
shape
)
*
-
1
print
FLAGS
.
input_file
data
.
astype
(
np
.
float32
).
tofile
(
FLAGS
.
input_file
)
...
...
@@ -122,12 +122,7 @@ def parse_args():
"--generate_data"
,
type
=
'bool'
,
default
=
"false"
,
help
=
"Random seed for generate test case."
)
parser
.
add_argument
(
"--random_seed"
,
type
=
int
,
default
=
"0"
,
help
=
"Random seed for generate test case."
)
help
=
"Generate data or not."
)
return
parser
.
parse_known_args
()
...
...
tools/validate_gcn.sh
浏览文件 @
336f35b4
...
...
@@ -79,7 +79,8 @@ build_and_run()
}
echo
"Step 1: Generate input data"
python tools/validate.py
--generate_data
true
--random_seed
1
\
rm
-rf
${
MODEL_DIR
}
/
${
INPUT_FILE_NAME
}
python tools/validate.py
--generate_data
true
\
--input_file
=
${
MODEL_DIR
}
/
${
INPUT_FILE_NAME
}
\
--input_shape
=
"
${
IMAGE_SIZE
}
,
${
IMAGE_SIZE
}
,3"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录