Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
慢慢CG
Mace
提交
33415ee9
Mace
项目概览
慢慢CG
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
33415ee9
编写于
5月 28, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Return mace status for allocate
上级
ccaec70c
变更
111
显示空白变更内容
内联
并排
Showing
111 changed file
with
2763 addition
and
3629 deletion
+2763
-3629
mace/core/mace.cc
mace/core/mace.cc
+4
-4
mace/core/net.cc
mace/core/net.cc
+3
-3
mace/core/workspace.cc
mace/core/workspace.cc
+5
-14
mace/kernels/addn.h
mace/kernels/addn.h
+1
-1
mace/kernels/arm/conv_2d_neon.h
mace/kernels/arm/conv_2d_neon.h
+51
-51
mace/kernels/arm/conv_2d_neon_15x1.cc
mace/kernels/arm/conv_2d_neon_15x1.cc
+9
-11
mace/kernels/arm/conv_2d_neon_1x1.cc
mace/kernels/arm/conv_2d_neon_1x1.cc
+2
-6
mace/kernels/arm/conv_2d_neon_1x15.cc
mace/kernels/arm/conv_2d_neon_1x15.cc
+9
-11
mace/kernels/arm/conv_2d_neon_1x7.cc
mace/kernels/arm/conv_2d_neon_1x7.cc
+10
-15
mace/kernels/arm/conv_2d_neon_3x3.cc
mace/kernels/arm/conv_2d_neon_3x3.cc
+31
-38
mace/kernels/arm/conv_2d_neon_5x5.cc
mace/kernels/arm/conv_2d_neon_5x5.cc
+61
-62
mace/kernels/arm/conv_2d_neon_7x1.cc
mace/kernels/arm/conv_2d_neon_7x1.cc
+10
-16
mace/kernels/arm/conv_2d_neon_7x7.cc
mace/kernels/arm/conv_2d_neon_7x7.cc
+164
-167
mace/kernels/arm/conv_winograd.cc
mace/kernels/arm/conv_winograd.cc
+69
-123
mace/kernels/arm/conv_winograd_test.cc
mace/kernels/arm/conv_winograd_test.cc
+13
-27
mace/kernels/arm/depthwise_conv2d_neon.h
mace/kernels/arm/depthwise_conv2d_neon.h
+9
-9
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
+47
-155
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+1
-1
mace/kernels/concat.h
mace/kernels/concat.h
+1
-1
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+3
-3
mace/kernels/conv_pool_2d_util.cc
mace/kernels/conv_pool_2d_util.cc
+3
-3
mace/kernels/deconv_2d.h
mace/kernels/deconv_2d.h
+2
-2
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+1
-1
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+1
-1
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+1
-1
mace/kernels/fully_connected.h
mace/kernels/fully_connected.h
+1
-1
mace/kernels/matmul.h
mace/kernels/matmul.h
+1
-1
mace/kernels/opencl/activation.cc
mace/kernels/opencl/activation.cc
+21
-17
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+5
-5
mace/kernels/opencl/batch_norm.cc
mace/kernels/opencl/batch_norm.cc
+11
-10
mace/kernels/opencl/bias_add.cc
mace/kernels/opencl/bias_add.cc
+6
-7
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+5
-7
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+12
-17
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+37
-36
mace/kernels/opencl/conv_2d.cc
mace/kernels/opencl/conv_2d.cc
+59
-61
mace/kernels/opencl/conv_2d_1x1.cc
mace/kernels/opencl/conv_2d_1x1.cc
+25
-26
mace/kernels/opencl/conv_2d_3x3.cc
mace/kernels/opencl/conv_2d_3x3.cc
+30
-30
mace/kernels/opencl/conv_2d_general.cc
mace/kernels/opencl/conv_2d_general.cc
+28
-28
mace/kernels/opencl/deconv_2d_opencl.cc
mace/kernels/opencl/deconv_2d_opencl.cc
+48
-48
mace/kernels/opencl/depth_to_space.cc
mace/kernels/opencl/depth_to_space.cc
+7
-8
mace/kernels/opencl/depthwise_conv.cc
mace/kernels/opencl/depthwise_conv.cc
+29
-32
mace/kernels/opencl/eltwise.cc
mace/kernels/opencl/eltwise.cc
+17
-20
mace/kernels/opencl/fully_connected.cc
mace/kernels/opencl/fully_connected.cc
+39
-37
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+22
-30
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+2
-3
mace/kernels/opencl/image_to_buffer.cc
mace/kernels/opencl/image_to_buffer.cc
+4
-6
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+8
-9
mace/kernels/opencl/out_of_range_check_test.cc
mace/kernels/opencl/out_of_range_check_test.cc
+8
-12
mace/kernels/opencl/pad.cc
mace/kernels/opencl/pad.cc
+20
-24
mace/kernels/opencl/pooling.cc
mace/kernels/opencl/pooling.cc
+11
-13
mace/kernels/opencl/resize_bilinear.cc
mace/kernels/opencl/resize_bilinear.cc
+7
-9
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+16
-18
mace/kernels/opencl/softmax.cc
mace/kernels/opencl/softmax.cc
+8
-10
mace/kernels/opencl/space_to_batch.cc
mace/kernels/opencl/space_to_batch.cc
+11
-16
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+9
-12
mace/kernels/pad.h
mace/kernels/pad.h
+1
-1
mace/kernels/pooling.h
mace/kernels/pooling.h
+1
-1
mace/kernels/proposal.h
mace/kernels/proposal.h
+1
-1
mace/kernels/psroi_align.h
mace/kernels/psroi_align.h
+1
-1
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+1
-1
mace/kernels/slice.h
mace/kernels/slice.h
+1
-1
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+2
-2
mace/ops/BUILD
mace/ops/BUILD
+35
-23
mace/ops/activation.h
mace/ops/activation.h
+3
-3
mace/ops/activation_test.cc
mace/ops/activation_test.cc
+2
-7
mace/ops/addn_test.cc
mace/ops/addn_test.cc
+4
-4
mace/ops/batch_norm.h
mace/ops/batch_norm.h
+1
-1
mace/ops/batch_norm_test.cc
mace/ops/batch_norm_test.cc
+126
-143
mace/ops/batch_to_space.h
mace/ops/batch_to_space.h
+1
-2
mace/ops/bias_add.h
mace/ops/bias_add.h
+1
-1
mace/ops/bias_add_test.cc
mace/ops/bias_add_test.cc
+22
-36
mace/ops/buffer_to_image_test.cc
mace/ops/buffer_to_image_test.cc
+7
-7
mace/ops/channel_shuffle_test.cc
mace/ops/channel_shuffle_test.cc
+8
-12
mace/ops/concat_test.cc
mace/ops/concat_test.cc
+7
-7
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+283
-354
mace/ops/conv_pool_2d_base.h
mace/ops/conv_pool_2d_base.h
+1
-2
mace/ops/core_test.cc
mace/ops/core_test.cc
+2
-3
mace/ops/deconv_2d_test.cc
mace/ops/deconv_2d_test.cc
+151
-313
mace/ops/depth_to_space_test.cc
mace/ops/depth_to_space_test.cc
+51
-80
mace/ops/depthwise_conv2d_test.cc
mace/ops/depthwise_conv2d_test.cc
+95
-111
mace/ops/eltwise.h
mace/ops/eltwise.h
+7
-7
mace/ops/eltwise_test.cc
mace/ops/eltwise_test.cc
+216
-329
mace/ops/folded_batch_norm.h
mace/ops/folded_batch_norm.h
+1
-1
mace/ops/folded_batch_norm_test.cc
mace/ops/folded_batch_norm_test.cc
+88
-105
mace/ops/fully_connected.h
mace/ops/fully_connected.h
+18
-28
mace/ops/fully_connected_test.cc
mace/ops/fully_connected_test.cc
+48
-50
mace/ops/local_response_norm.h
mace/ops/local_response_norm.h
+2
-3
mace/ops/local_response_norm_test.cc
mace/ops/local_response_norm_test.cc
+11
-11
mace/ops/matmul_test.cc
mace/ops/matmul_test.cc
+11
-13
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+75
-77
mace/ops/pad.h
mace/ops/pad.h
+1
-2
mace/ops/pad_test.cc
mace/ops/pad_test.cc
+28
-47
mace/ops/pooling_test.cc
mace/ops/pooling_test.cc
+154
-200
mace/ops/proposal_test.cc
mace/ops/proposal_test.cc
+7
-8
mace/ops/quantize.cc
mace/ops/quantize.cc
+3
-3
mace/ops/quantize.h
mace/ops/quantize.h
+16
-26
mace/ops/quantize_test.cc
mace/ops/quantize_test.cc
+71
-83
mace/ops/resize_bilinear.h
mace/ops/resize_bilinear.h
+2
-3
mace/ops/resize_bilinear_test.cc
mace/ops/resize_bilinear_test.cc
+19
-32
mace/ops/slice.h
mace/ops/slice.h
+2
-2
mace/ops/slice_test.cc
mace/ops/slice_test.cc
+16
-22
mace/ops/softmax.h
mace/ops/softmax.h
+1
-1
mace/ops/softmax_test.cc
mace/ops/softmax_test.cc
+17
-18
mace/ops/space_to_batch.h
mace/ops/space_to_batch.h
+4
-6
mace/ops/space_to_batch_test.cc
mace/ops/space_to_batch_test.cc
+65
-82
mace/ops/space_to_depth.h
mace/ops/space_to_depth.h
+7
-9
mace/ops/transpose.cc
mace/ops/transpose.cc
+3
-3
mace/ops/transpose.h
mace/ops/transpose.h
+8
-8
mace/ops/transpose_test.cc
mace/ops/transpose_test.cc
+17
-22
mace/ops/winograd_convolution_test.cc
mace/ops/winograd_convolution_test.cc
+18
-31
mace/public/mace.h
mace/public/mace.h
+1
-1
未找到文件。
mace/core/mace.cc
浏览文件 @
33415ee9
...
...
@@ -155,13 +155,13 @@ MaceStatus MaceEngine::Impl::Init(
}
}
else
{
#endif
MACE_
FAILURE_RETURN
(
ws_
->
LoadModelTensor
(
MACE_
RETURN_IF_ERROR
(
ws_
->
LoadModelTensor
(
*
net_def
,
device_type_
,
model_data
));
// Init model
auto
net
=
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type_
,
NetMode
::
INIT
);
MACE_
FAILURE_RETURN
(
net
->
Run
());
MACE_
RETURN_IF_ERROR
(
net
->
Run
());
net_
=
CreateNet
(
op_registry_
,
*
net_def
,
ws_
.
get
(),
device_type_
);
#ifdef MACE_ENABLE_HEXAGON
}
...
...
@@ -195,7 +195,7 @@ MaceStatus MaceEngine::Impl::Run(
" please use 1 to fill missing dimensions"
);
Tensor
*
input_tensor
=
ws_
->
GetTensor
(
MakeString
(
"mace_input_node_"
,
input
.
first
));
input_tensor
->
Resize
(
input
.
second
.
shape
(
));
MACE_RETURN_IF_ERROR
(
input_tensor
->
Resize
(
input
.
second
.
shape
()
));
{
Tensor
::
MappingGuard
input_guard
(
input_tensor
);
float
*
input_data
=
input_tensor
->
mutable_data
<
float
>
();
...
...
@@ -221,7 +221,7 @@ MaceStatus MaceEngine::Impl::Run(
hexagon_controller_
->
ExecuteGraph
(
*
input_tensors
[
0
],
output_tensors
[
0
]);
}
else
{
#endif
MACE_
FAILURE_RETURN
(
net_
->
Run
(
run_metadata
));
MACE_
RETURN_IF_ERROR
(
net_
->
Run
(
run_metadata
));
#ifdef MACE_ENABLE_HEXAGON
}
#endif
...
...
mace/core/net.cc
浏览文件 @
33415ee9
...
...
@@ -71,7 +71,7 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
CallStats
call_stats
;
if
(
future_wait
)
{
StatsFuture
future
;
MACE_
FAILURE_RETURN
(
op
->
Run
(
&
future
));
MACE_
RETURN_IF_ERROR
(
op
->
Run
(
&
future
));
if
(
run_metadata
!=
nullptr
)
{
future
.
wait_fn
(
&
call_stats
);
}
else
{
...
...
@@ -79,10 +79,10 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) {
}
}
else
if
(
run_metadata
!=
nullptr
)
{
call_stats
.
start_micros
=
NowMicros
();
MACE_
FAILURE_RETURN
(
op
->
Run
(
nullptr
));
MACE_
RETURN_IF_ERROR
(
op
->
Run
(
nullptr
));
call_stats
.
end_micros
=
NowMicros
();
}
else
{
MACE_
FAILURE_RETURN
(
op
->
Run
(
nullptr
));
MACE_
RETURN_IF_ERROR
(
op
->
Run
(
nullptr
));
}
if
(
run_metadata
!=
nullptr
)
{
...
...
mace/core/workspace.cc
浏览文件 @
33415ee9
...
...
@@ -83,10 +83,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
}
else
{
tensor_buffer_
=
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
type
)));
MaceStatus
status
=
tensor_buffer_
->
Allocate
(
model_data_size
);
if
(
status
!=
MaceStatus
::
MACE_SUCCESS
)
{
return
status
;
}
MACE_RETURN_IF_ERROR
(
tensor_buffer_
->
Allocate
(
model_data_size
));
tensor_buffer_
->
Map
(
nullptr
);
tensor_buffer_
->
Copy
(
const_cast
<
unsigned
char
*>
(
model_data
),
0
,
model_data_size
);
...
...
@@ -156,11 +153,8 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
if
(
mem_block
.
mem_id
()
>=
20000
)
{
std
::
unique_ptr
<
BufferBase
>
image_buf
(
new
Image
());
MaceStatus
status
=
image_buf
->
Allocate
(
{
mem_block
.
x
(),
mem_block
.
y
()},
dtype
);
if
(
status
!=
MaceStatus
::
MACE_SUCCESS
)
{
return
status
;
}
MACE_RETURN_IF_ERROR
(
image_buf
->
Allocate
(
{
mem_block
.
x
(),
mem_block
.
y
()},
dtype
));
preallocated_allocator_
.
SetBuffer
(
mem_block
.
mem_id
(),
std
::
move
(
image_buf
));
}
...
...
@@ -168,12 +162,9 @@ MaceStatus Workspace::CreateOutputTensorBuffer(const NetDef &net_def,
if
(
mem_block
.
mem_id
()
<
20000
)
{
std
::
unique_ptr
<
BufferBase
>
tensor_buf
(
new
Buffer
(
GetDeviceAllocator
(
device_type
)));
M
aceStatus
status
=
tensor_buf
->
Allocate
(
M
ACE_RETURN_IF_ERROR
(
tensor_buf
->
Allocate
(
mem_block
.
x
()
*
GetEnumTypeSize
(
dtype
)
+
MACE_EXTRA_BUFFER_PAD_SIZE
);
if
(
status
!=
MaceStatus
::
MACE_SUCCESS
)
{
return
status
;
}
+
MACE_EXTRA_BUFFER_PAD_SIZE
));
preallocated_allocator_
.
SetBuffer
(
mem_block
.
mem_id
(),
std
::
move
(
tensor_buf
));
}
...
...
mace/kernels/addn.h
浏览文件 @
33415ee9
...
...
@@ -40,7 +40,7 @@ struct AddNFunctor {
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
MACE_
FAILURE_RETURN
(
output_tensor
->
ResizeLike
(
input_tensors
[
0
]));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
ResizeLike
(
input_tensors
[
0
]));
index_t
size
=
output_tensor
->
size
();
Tensor
::
MappingGuard
output_map
(
output_tensor
);
float
*
output_data
=
output_tensor
->
mutable_data
<
float
>
();
...
...
mace/kernels/arm/conv_2d_neon.h
浏览文件 @
33415ee9
...
...
@@ -103,9 +103,9 @@ inline void Conv2dCPUKHxKWCalc(const float *in_ptr,
for
(
index_t
w
=
0
;
w
<
out_width
;
++
w
)
{
for
(
int
i
=
0
;
i
<
filter_height
;
++
i
)
{
for
(
int
j
=
0
;
j
<
filter_width
;
++
j
)
{
out_ptr
[
h
*
out_width
+
w
]
+=
in_ptr
[(
h
*
stride
+
i
)
*
in_width
+
(
w
*
stride
+
j
)]
*
filter_ptr
[
i
*
filter_width
+
j
];
out_ptr
[
h
*
out_width
+
w
]
+=
in_ptr
[(
h
*
stride
+
i
)
*
in_width
+
(
w
*
stride
+
j
)]
*
filter_ptr
[
i
*
filter_width
+
j
];
}
}
}
...
...
mace/kernels/arm/conv_2d_neon_15x1.cc
浏览文件 @
33415ee9
...
...
@@ -38,16 +38,15 @@ inline void Conv2dCPUK15x1Calc(const float *in_ptr,
for
(
index_t
iw
=
0
;
iw
<
tile_width
&&
w
+
iw
<
out_width
;
++
iw
)
{
for
(
int
i
=
0
;
i
<
15
;
++
i
)
{
for
(
int
j
=
0
;
j
<
1
;
++
j
)
{
out_ptr
[
io
*
out_image_size
+
ih
*
out_width
+
w
+
iw
]
+=
in_ptr
[(
ih
*
stride
+
i
)
*
in_width
+
((
w
+
iw
)
*
stride
+
j
)]
*
filter_ptr
[
io
*
in_channels
*
15
+
i
*
1
+
j
];
out_ptr
[
io
*
out_image_size
+
ih
*
out_width
+
w
+
iw
]
+=
in_ptr
[(
ih
*
stride
+
i
)
*
in_width
+
((
w
+
iw
)
*
stride
+
j
)]
*
filter_ptr
[
io
*
in_channels
*
15
+
i
*
1
+
j
];
}
}
}
}
}
// Ho = 4, Wo = 1, Co = 1
void
Conv2dNeonK15x1S1
(
const
float
*
input
,
const
float
*
filter
,
...
...
@@ -69,8 +68,7 @@ void Conv2dNeonK15x1S1(const float *input,
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
float
*
out_ptr_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
float
*
out_ptr_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
const
float
*
in_ptr_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
...
...
mace/kernels/arm/conv_2d_neon_1x1.cc
浏览文件 @
33415ee9
...
...
@@ -31,12 +31,8 @@ void Conv2dNeonK1x1S1(const float *input,
const
index_t
out_channels
,
float
*
output
)
{
for
(
index_t
b
=
0
;
b
<
batch
;
++
b
)
{
Gemm
(
filter
,
input
+
b
*
in_channels
*
height
*
width
,
1
,
out_channels
,
in_channels
,
height
*
width
,
Gemm
(
filter
,
input
+
b
*
in_channels
*
height
*
width
,
1
,
out_channels
,
in_channels
,
height
*
width
,
output
+
b
*
out_channels
*
height
*
width
);
}
}
...
...
mace/kernels/arm/conv_2d_neon_1x15.cc
浏览文件 @
33415ee9
...
...
@@ -17,8 +17,8 @@
#endif
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -39,16 +39,15 @@ inline void Conv2dCPUK1x15Calc(const float *in_ptr,
for
(
index_t
iw
=
0
;
iw
<
out_width
;
++
iw
)
{
for
(
int
i
=
0
;
i
<
1
;
++
i
)
{
for
(
int
j
=
0
;
j
<
15
;
++
j
)
{
out_ptr
[
io
*
out_image_size
+
(
h
+
ih
)
*
out_width
+
iw
]
+=
in_ptr
[((
h
+
ih
)
*
stride
+
i
)
*
in_width
+
(
iw
*
stride
+
j
)]
*
filter_ptr
[
io
*
in_channels
*
15
+
i
*
15
+
j
];
out_ptr
[
io
*
out_image_size
+
(
h
+
ih
)
*
out_width
+
iw
]
+=
in_ptr
[((
h
+
ih
)
*
stride
+
i
)
*
in_width
+
(
iw
*
stride
+
j
)]
*
filter_ptr
[
io
*
in_channels
*
15
+
i
*
15
+
j
];
}
}
}
}
}
// Ho = 1, Wo = 4, Co = 1
void
Conv2dNeonK1x15S1
(
const
float
*
input
,
const
float
*
filter
,
...
...
@@ -70,8 +69,7 @@ void Conv2dNeonK1x15S1(const float *input,
const
index_t
out_width
=
out_shape
[
3
];
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
float
*
out_ptr_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
float
*
out_ptr_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
const
float
*
in_ptr_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
...
...
mace/kernels/arm/conv_2d_neon_1x7.cc
浏览文件 @
33415ee9
...
...
@@ -41,8 +41,7 @@ void Conv2dNeonK1x7S1(const float *input,
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON)
float
*
out_ptr1_base
=
output
+
b
*
out_batch_size
+
(
m
+
1
)
*
out_image_size
;
...
...
@@ -56,12 +55,9 @@ void Conv2dNeonK1x7S1(const float *input,
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr0
=
filter
+
m
*
in_channels
*
7
+
c
*
7
;
#if defined(MACE_ENABLE_NEON)
const
float
*
filter_ptr1
=
filter
+
(
m
+
1
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr2
=
filter
+
(
m
+
2
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr3
=
filter
+
(
m
+
3
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr1
=
filter
+
(
m
+
1
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr2
=
filter
+
(
m
+
2
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr3
=
filter
+
(
m
+
3
)
*
in_channels
*
7
+
c
*
7
;
/* load filter (4 outch x 1 height x 4 width) */
float32x4_t
vf00
,
vf01
;
float32x4_t
vf10
,
vf11
;
...
...
@@ -241,9 +237,8 @@ void Conv2dNeonK1x7S1(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
1
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
1
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
#endif
}
// c
}
...
...
mace/kernels/arm/conv_2d_neon_3x3.cc
浏览文件 @
33415ee9
...
...
@@ -75,7 +75,6 @@ void Conv2dNeonK3x3S1(const float *input,
vf11
=
vld1q_f32
(
filter_ptr1
+
3
);
vf12
=
vld1q_f32
(
filter_ptr1
+
6
);
for
(
index_t
h
=
0
;
h
+
1
<
out_height
;
h
+=
2
)
{
for
(
index_t
w
=
0
;
w
+
3
<
out_width
;
w
+=
4
)
{
// input (4 height x 3 slide): vi_height_slide
...
...
@@ -198,7 +197,6 @@ void Conv2dNeonK3x3S1(const float *input,
vf167
=
vld1_f32
(
filter_ptr1
+
6
);
vf189
=
vld1_f32
(
filter_ptr1
+
8
);
for
(
index_t
h
=
0
;
h
+
1
<
out_height
;
h
+=
2
)
{
for
(
index_t
w
=
0
;
w
+
3
<
out_width
;
w
+=
4
)
{
// input (4 height x 3 slide): vi_height_slide
...
...
@@ -313,11 +311,11 @@ void Conv2dNeonK3x3S1(const float *input,
}
// c
}
else
{
for
(
index_t
mm
=
m
;
mm
<
out_channels
;
++
mm
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
mm
*
out_image_size
;
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
mm
*
out_image_size
;
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
const
float
*
in_ptr0
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
in_ptr0
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
#if defined(MACE_ENABLE_NEON)
const
float
*
in_ptr1
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
+
1
*
in_width
;
...
...
@@ -396,7 +394,6 @@ void Conv2dNeonK3x3S1(const float *input,
vst1q_f32
(
out_ptr0
,
vo00
);
vst1q_f32
(
out_ptr0
+
out_width
,
vo01
);
in_ptr0
+=
4
;
in_ptr1
+=
4
;
in_ptr2
+=
4
;
...
...
@@ -482,7 +479,6 @@ void Conv2dNeonK3x3S1(const float *input,
vst1q_f32
(
out_ptr0
,
vo00
);
vst1q_f32
(
out_ptr0
+
out_width
,
vo01
);
in_ptr0
+=
4
;
in_ptr1
+=
4
;
in_ptr2
+=
4
;
...
...
@@ -499,9 +495,8 @@ void Conv2dNeonK3x3S1(const float *input,
out_ptr0
+=
out_width
;
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr0
,
filter_ptr0
,
in_width
,
3
,
3
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
Conv2dCPUKHxKWCalc
(
in_ptr0
,
filter_ptr0
,
in_width
,
3
,
3
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
#endif
}
// c
}
// mm
...
...
@@ -529,8 +524,7 @@ void Conv2dNeonK3x3S2(const float *input,
const
index_t
out_height
=
out_shape
[
2
];
const
index_t
out_width
=
out_shape
[
3
];
const
float
*
in_base
=
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr
=
filter
+
m
*
in_channels
*
9
+
c
*
9
;
const
float
*
filter_ptr
=
filter
+
m
*
in_channels
*
9
+
c
*
9
;
float
*
out_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
...
...
@@ -656,9 +650,8 @@ void Conv2dNeonK3x3S2(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_base
,
filter_ptr
,
in_width
,
3
,
3
,
out_height
,
out_width
,
out_base
,
2
);
Conv2dCPUKHxKWCalc
(
in_base
,
filter_ptr
,
in_width
,
3
,
3
,
out_height
,
out_width
,
out_base
,
2
);
#endif
}
// c
}
// m
...
...
mace/kernels/arm/conv_2d_neon_5x5.cc
浏览文件 @
33415ee9
...
...
@@ -205,9 +205,8 @@ void Conv2dNeonK5x5S1(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
5
,
5
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
5
,
5
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
#endif
}
// c
}
// mm
...
...
mace/kernels/arm/conv_2d_neon_7x1.cc
浏览文件 @
33415ee9
...
...
@@ -41,8 +41,7 @@ void Conv2dNeonK7x1S1(const float *input,
const
index_t
in_channels
=
in_shape
[
1
];
const
index_t
in_width
=
in_shape
[
3
];
if
(
m
+
3
<
out_channels
)
{
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
float
*
out_ptr0_base
=
output
+
b
*
out_batch_size
+
m
*
out_image_size
;
#if defined(MACE_ENABLE_NEON)
float
*
out_ptr1_base
=
output
+
b
*
out_batch_size
+
(
m
+
1
)
*
out_image_size
;
...
...
@@ -56,12 +55,9 @@ void Conv2dNeonK7x1S1(const float *input,
input
+
b
*
in_batch_size
+
c
*
in_image_size
;
const
float
*
filter_ptr0
=
filter
+
m
*
in_channels
*
7
+
c
*
7
;
#if defined(MACE_ENABLE_NEON)
const
float
*
filter_ptr1
=
filter
+
(
m
+
1
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr2
=
filter
+
(
m
+
2
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr3
=
filter
+
(
m
+
3
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr1
=
filter
+
(
m
+
1
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr2
=
filter
+
(
m
+
2
)
*
in_channels
*
7
+
c
*
7
;
const
float
*
filter_ptr3
=
filter
+
(
m
+
3
)
*
in_channels
*
7
+
c
*
7
;
/* load filter (4 outch x 4 height x 1 width) */
float32x4_t
vf00
,
vf01
;
float32x4_t
vf10
,
vf11
;
...
...
@@ -98,7 +94,6 @@ void Conv2dNeonK7x1S1(const float *input,
out_ptr3_base
[
out_offset
+
2
*
out_width
],
out_ptr3_base
[
out_offset
+
3
*
out_width
]};
// input offset
index_t
in_offset
=
h
*
in_width
+
w
;
// input (3 slide)
...
...
@@ -282,9 +277,8 @@ void Conv2dNeonK7x1S1(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
1
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
1
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
#endif
}
// c
}
...
...
mace/kernels/arm/conv_2d_neon_7x7.cc
浏览文件 @
33415ee9
...
...
@@ -298,9 +298,8 @@ void Conv2dNeonK7x7S1(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
1
);
#endif
}
// c
}
// mm
...
...
@@ -464,9 +463,8 @@ void Conv2dNeonK7x7S2(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
2
);
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
2
);
#endif
}
// c
}
// mm
...
...
@@ -630,9 +628,8 @@ void Conv2dNeonK7x7S3(const float *input,
}
// w
}
// h
#else
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
3
);
Conv2dCPUKHxKWCalc
(
in_ptr_base
,
filter_ptr0
,
in_width
,
7
,
7
,
out_height
,
out_width
,
out_ptr0_base
,
3
);
#endif
}
// c
}
// mm
...
...
mace/kernels/arm/conv_winograd.cc
浏览文件 @
33415ee9
...
...
@@ -17,8 +17,8 @@
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/kernels/gemm.h"
#include "mace/utils/utils.h"
#include "mace/utils/logging.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -49,9 +49,8 @@ void TransformInput4x4(const float *input,
s15
;
// load tile data
const
float
*
input_ptr
=
input
+
n
*
input_batch_size
+
c
*
in_height_width
+
h
*
in_width
+
w
;
const
float
*
input_ptr
=
input
+
n
*
input_batch_size
+
c
*
in_height_width
+
h
*
in_width
+
w
;
d0
=
input_ptr
[
0
];
d1
=
input_ptr
[
1
];
d2
=
input_ptr
[
2
];
...
...
@@ -166,9 +165,8 @@ void TransformInput8x8(const float *input,
float
s
[
8
][
8
];
for
(
index_t
h
=
0
;
h
<
in_height
-
2
;
h
+=
6
)
{
for
(
index_t
w
=
0
;
w
<
in_width
-
2
;
w
+=
6
)
{
const
float
*
input_ptr
=
input
+
n
*
input_batch_size
+
c
*
in_height_width
+
h
*
in_width
+
w
;
const
float
*
input_ptr
=
input
+
n
*
input_batch_size
+
c
*
in_height_width
+
h
*
in_width
+
w
;
for
(
int
i
=
0
;
i
<
8
;
++
i
)
{
float
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
d7
;
...
...
@@ -258,25 +256,16 @@ void BatchGemm(const float *input,
const
index_t
out_stride
=
out_channels
*
tile_count
;
if
(
batch
==
1
)
{
Gemm
(
filter
,
input
,
in_tile_area
,
out_channels
,
in_channels
,
tile_count
,
Gemm
(
filter
,
input
,
in_tile_area
,
out_channels
,
in_channels
,
tile_count
,
output
);
}
else
{
#pragma omp parallel for collapse(2)
for
(
int
b
=
0
;
b
<
batch
;
++
b
)
{
for
(
int
i
=
0
;
i
<
in_tile_area
;
++
i
)
{
const
float
*
in_ptr
=
input
+
b
*
in_batch_size
+
i
*
in_stride
;
const
float
*
in_ptr
=
input
+
b
*
in_batch_size
+
i
*
in_stride
;
const
float
*
filter_ptr
=
filter
+
i
*
filter_stride
;
float
*
out_ptr
=
output
+
b
*
out_batch_size
+
i
*
out_stride
;
Gemm
(
filter_ptr
,
in_ptr
,
1
,
out_channels
,
/* rows */
Gemm
(
filter_ptr
,
in_ptr
,
1
,
out_channels
,
/* rows */
in_channels
,
/* K */
tile_count
,
/* cols */
out_ptr
);
...
...
@@ -345,9 +334,8 @@ void TransformOutput4x4(const float *input,
v2
=
s2
-
s4
-
s6
;
v3
=
s3
-
s5
-
s7
;
float
*
output_ptr
=
output
+
n
*
output_batch_size
+
m
*
out_image_size
+
h
*
out_width
+
w
;
float
*
output_ptr
=
output
+
n
*
output_batch_size
+
m
*
out_image_size
+
h
*
out_width
+
w
;
output_ptr
[
0
]
=
v0
;
output_ptr
[
1
]
=
v1
;
output_ptr
[
out_width
]
=
v2
;
...
...
@@ -433,9 +421,8 @@ void TransformOutput8x8(const float *input,
input_ptr
+=
8
*
stride
;
}
float
*
output_ptr
=
output
+
n
*
output_batch_size
+
m
*
out_image_size
+
h
*
out_width
+
w
;
float
*
output_ptr
=
output
+
n
*
output_batch_size
+
m
*
out_image_size
+
h
*
out_width
+
w
;
for
(
int
i
=
0
;
i
<
6
;
++
i
)
{
float
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
d7
;
...
...
@@ -471,7 +458,6 @@ void TransformOutput8x8(const float *input,
}
}
// namespace
// OCHW => TOC
// no need to optimize, it will exist in converter
void
TransformFilter4x4
(
const
float
*
filter
,
...
...
@@ -573,16 +559,14 @@ void TransformFilter8x8(const float *filter,
float
*
output
)
{
const
index_t
stride
=
out_channels
*
in_channels
;
const
float
G
[
8
][
3
]
=
{
{
1.0
f
,
0.0
f
,
0.0
f
},
const
float
G
[
8
][
3
]
=
{{
1.0
f
,
0.0
f
,
0.0
f
},
{
-
2.0
f
/
9
,
-
2.0
f
/
9
,
-
2.0
f
/
9
},
{
-
2.0
f
/
9
,
2.0
f
/
9
,
-
2.0
f
/
9
},
{
1.0
f
/
90
,
1.0
f
/
45
,
2.0
f
/
45
},
{
1.0
f
/
90
,
-
1.0
f
/
45
,
2.0
f
/
45
},
{
1.0
f
/
45
,
1.0
f
/
90
,
1.0
f
/
180
},
{
1.0
f
/
45
,
-
1.0
f
/
90
,
1.0
f
/
180
},
{
0.0
f
,
0.0
f
,
1.0
f
}
};
{
0.0
f
,
0.0
f
,
1.0
f
}};
#pragma omp parallel for collapse(2)
for
(
index_t
m
=
0
;
m
<
out_channels
;
++
m
)
{
...
...
@@ -640,55 +624,31 @@ void WinoGradConv3x3s1(const float *input,
switch
(
out_tile_size
)
{
case
2
:
TransformInput4x4
(
input
,
batch
,
in_height
,
in_width
,
in_channels
,
tile_count
,
transformed_input
);
TransformInput4x4
(
input
,
batch
,
in_height
,
in_width
,
in_channels
,
tile_count
,
transformed_input
);
break
;
case
6
:
TransformInput8x8
(
input
,
batch
,
in_height
,
in_width
,
in_channels
,
tile_count
,
transformed_input
);
TransformInput8x8
(
input
,
batch
,
in_height
,
in_width
,
in_channels
,
tile_count
,
transformed_input
);
break
;
default:
MACE_NOT_IMPLEMENTED
;
default:
MACE_NOT_IMPLEMENTED
;
}
BatchGemm
(
transformed_input
,
transformed_filter
,
batch
,
in_channels
,
out_channels
,
tile_count
,
out_tile_size
,
transformed_output
);
BatchGemm
(
transformed_input
,
transformed_filter
,
batch
,
in_channels
,
out_channels
,
tile_count
,
out_tile_size
,
transformed_output
);
switch
(
out_tile_size
)
{
case
2
:
TransformOutput4x4
(
transformed_output
,
batch
,
out_height
,
out_width
,
out_channels
,
tile_count
,
output
);
TransformOutput4x4
(
transformed_output
,
batch
,
out_height
,
out_width
,
out_channels
,
tile_count
,
output
);
break
;
case
6
:
TransformOutput8x8
(
transformed_output
,
batch
,
out_height
,
out_width
,
out_channels
,
tile_count
,
output
);
TransformOutput8x8
(
transformed_output
,
batch
,
out_height
,
out_width
,
out_channels
,
tile_count
,
output
);
break
;
default:
MACE_NOT_IMPLEMENTED
;
default:
MACE_NOT_IMPLEMENTED
;
}
}
...
...
@@ -712,8 +672,8 @@ void WinoGradConv3x3s1(const float *input,
index_t
transformed_input_size
=
in_tile_area
*
batch
*
in_channels
*
tile_count
;
index_t
transformed_filter_size
=
in_tile_area
*
out_channels
*
in_channels
;
index_t
transformed_output_size
=
in_tile_area
*
batch
*
out_channels
*
tile_count
;
index_t
transformed_output_size
=
in_tile_area
*
batch
*
out_channels
*
tile_count
;
float
*
transformed_input
=
new
float
[
transformed_input_size
];
// TNCB
float
*
transformed_filter
=
new
float
[
transformed_filter_size
];
// TOC
...
...
@@ -721,35 +681,22 @@ void WinoGradConv3x3s1(const float *input,
switch
(
out_tile_size
)
{
case
2
:
TransformFilter4x4
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
TransformFilter4x4
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
break
;
case
6
:
TransformFilter8x8
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
TransformFilter8x8
(
filter
,
in_channels
,
out_channels
,
transformed_filter
);
break
;
default:
MACE_NOT_IMPLEMENTED
;
}
WinoGradConv3x3s1
(
input
,
transformed_filter
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
out_tile_size
,
transformed_input
,
transformed_output
,
output
);
default:
MACE_NOT_IMPLEMENTED
;
}
WinoGradConv3x3s1
(
input
,
transformed_filter
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
out_tile_size
,
transformed_input
,
transformed_output
,
output
);
delete
[]
transformed_input
;
delete
[]
transformed_filter
;
delete
[]
transformed_output
;
delete
[]
transformed_input
;
delete
[]
transformed_filter
;
delete
[]
transformed_output
;
}
void
ConvRef3x3s1
(
const
float
*
input
,
...
...
@@ -778,10 +725,9 @@ void ConvRef3x3s1(const float *input,
index_t
iw
=
w
+
kw
;
index_t
in_offset
=
((
b
*
in_channels
+
c
)
*
in_height
+
ih
)
*
in_width
+
iw
;
index_t
filter_offset
=
(((
m
*
in_channels
)
+
c
)
*
3
+
kh
)
*
3
+
kw
;
output
[
out_offset
]
+=
input
[
in_offset
]
*
filter
[
filter_offset
];
index_t
filter_offset
=
(((
m
*
in_channels
)
+
c
)
*
3
+
kh
)
*
3
+
kw
;
output
[
out_offset
]
+=
input
[
in_offset
]
*
filter
[
filter_offset
];
}
}
}
...
...
mace/kernels/arm/conv_winograd_test.cc
浏览文件 @
33415ee9
...
...
@@ -13,13 +13,13 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include <algorithm>
#include <memory>
#include <random>
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/core/types.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_winograd.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -55,32 +55,18 @@ TEST(ConvWinogradTest, winograd) {
std
::
random_device
rd
;
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
std
::
generate
(
input_data
,
input_data
+
input_size
,
[
&
gen
,
&
nd
]
{
std
::
generate
(
input_data
,
input_data
+
input_size
,
[
&
gen
,
&
nd
]
{
return
std
::
max
(
-
1.0
f
,
std
::
min
(
1.0
f
,
nd
(
gen
)));
});
std
::
generate
(
filter_data
,
filter_data
+
filter_size
,
[
&
gen
,
&
nd
]
{
std
::
generate
(
filter_data
,
filter_data
+
filter_size
,
[
&
gen
,
&
nd
]
{
return
std
::
max
(
-
1.0
f
,
std
::
min
(
1.0
f
,
nd
(
gen
)));
});
kernels
::
ConvRef3x3s1
(
input_data
,
filter_data
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
output_data_ref
);
kernels
::
ConvRef3x3s1
(
input_data
,
filter_data
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
output_data_ref
);
kernels
::
WinoGradConv3x3s1
(
input_data
,
filter_data
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
6
,
kernels
::
WinoGradConv3x3s1
(
input_data
,
filter_data
,
batch
,
in_height
,
in_width
,
in_channels
,
out_channels
,
6
,
output_data
);
// test
...
...
mace/kernels/arm/depthwise_conv2d_neon.h
浏览文件 @
33415ee9
mace/kernels/arm/depthwise_conv2d_neon_3x3.cc
浏览文件 @
33415ee9
...
...
@@ -16,8 +16,8 @@
#include <arm_neon.h>
#endif
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/core/macros.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
namespace
mace
{
namespace
kernels
{
...
...
@@ -52,9 +52,9 @@ void DepthwiseConv2dPixel(const float *in_base,
// Ho = 2, Wo = 4, Co = 1
void
DepthwiseConv2dNeonK3x3S1
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
valid_h_start
,
const
index_t
valid_h_stop
,
const
index_t
valid_w_start
,
...
...
@@ -88,18 +88,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// top
for
(
h
=
0
;
h
<
valid_h_start
;
++
h
)
{
for
(
w
=
0
;
w
<
out_shape
[
3
];
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
...
...
@@ -113,30 +104,12 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
for
(
h
=
valid_h_start
;
h
+
1
<
valid_h_stop
;
h
+=
2
)
{
// left
for
(
w
=
0
;
w
<
valid_w_start
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
+
1
,
w
,
h
+
1
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
+
1
,
w
,
h
+
1
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
for
(
w
=
valid_w_start
;
w
+
3
<
valid_w_stop
;
w
+=
4
)
{
...
...
@@ -227,47 +200,20 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// right
for
(;
w
<
out_width
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
+
1
,
w
,
h
+
1
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
+
1
,
w
,
h
+
1
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
// h
#else
for
(
index_t
ih
=
valid_h_start
;
ih
<
valid_h_stop
;
++
ih
)
{
for
(
index_t
iw
=
0
;
iw
<
out_shape
[
3
];
++
iw
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
ih
,
iw
,
ih
-
pad_top
,
iw
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
ih
,
iw
,
ih
-
pad_top
,
iw
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
#endif
...
...
@@ -275,18 +221,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
// bottom
for
(;
h
<
out_shape
[
2
];
++
h
)
{
for
(
w
=
0
;
w
<
out_shape
[
3
];
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
-
pad_top
,
w
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
}
// m
...
...
@@ -295,9 +232,9 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
void
DepthwiseConv2dNeonK3x3S2
(
const
float
*
input
,
const
float
*
filter
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
*
in_shape
,
const
index_t
*
out_shape
,
const
int
*
pad_hw
,
const
index_t
valid_h_start
,
const
index_t
valid_h_stop
,
const
index_t
valid_w_start
,
...
...
@@ -330,18 +267,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// top
for
(
h
=
0
;
h
<
valid_h_start
;
++
h
)
{
for
(
w
=
0
;
w
<
out_width
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
...
...
@@ -355,18 +283,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
for
(
h
=
valid_h_start
;
h
<
valid_h_stop
;
++
h
)
{
// left
for
(
w
=
0
;
w
<
valid_w_start
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
for
(
w
=
valid_w_start
;
w
+
3
<
valid_w_stop
;
w
+=
4
)
{
...
...
@@ -435,35 +354,17 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// right
for
(;
w
<
out_width
;
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
// h
#else
for
(
index_t
ih
=
valid_h_start
;
ih
<
valid_h_stop
;
++
ih
)
{
for
(
index_t
iw
=
0
;
iw
<
out_width
;
++
iw
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
ih
,
iw
,
ih
*
2
-
pad_top
,
iw
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
ih
,
iw
,
ih
*
2
-
pad_top
,
iw
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
#endif
...
...
@@ -471,18 +372,9 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
// bottom
for
(;
h
<
out_shape
[
2
];
++
h
)
{
for
(
w
=
0
;
w
<
out_shape
[
3
];
++
w
)
{
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
DepthwiseConv2dPixel
(
in_base
,
filter_ptr
,
h
,
w
,
h
*
2
-
pad_top
,
w
*
2
-
pad_left
,
out_width
,
in_height
,
in_width
,
3
,
3
,
out_base
);
}
}
}
// m
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
33415ee9
...
...
@@ -32,7 +32,7 @@ struct ChannelShuffleFunctor {
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
Tensor
::
MappingGuard
logits_guard
(
input
);
Tensor
::
MappingGuard
output_guard
(
output
);
...
...
mace/kernels/concat.h
浏览文件 @
33415ee9
...
...
@@ -68,7 +68,7 @@ struct ConcatFunctor : ConcatFunctorBase {
outer_sizes
[
i
]
=
input
->
size
()
/
inner_size
;
output_shape
[
axis_
]
+=
input
->
dim
(
axis_
);
}
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
...
...
mace/kernels/conv_2d.h
浏览文件 @
33415ee9
...
...
@@ -296,7 +296,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
RoundType
::
FLOOR
,
output_shape
.
data
());
}
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
index_t
batch
=
output
->
dim
(
0
);
index_t
channels
=
output
->
dim
(
1
);
...
...
@@ -497,7 +497,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if
(
is_filter_transformed_
)
{
transformed_filter_ptr
=
filter_data
;
}
else
{
MACE_
FAILURE_RETURN
(
transformed_filter_
.
Resize
(
MACE_
RETURN_IF_ERROR
(
transformed_filter_
.
Resize
(
transformed_filter_shape
));
switch
(
winograd_out_tile_size
)
{
case
2
:
...
...
@@ -644,7 +644,7 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
const
Tensor
*
pad_input_ptr
=
input
;
if
(
extra_input_height
!=
input_height
||
extra_input_width
!=
input_width
)
{
MACE_
FAILURE_RETURN
(
ConstructNCHWInputWithSpecificPadding
(
input
,
MACE_
RETURN_IF_ERROR
(
ConstructNCHWInputWithSpecificPadding
(
input
,
pad_top
,
pad_bottom
,
pad_left
,
...
...
mace/kernels/conv_pool_2d_util.cc
浏览文件 @
33415ee9
...
...
@@ -306,7 +306,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor,
const
int
padded_top
=
paddings
[
0
]
/
2
;
const
int
padded_left
=
paddings
[
1
]
/
2
;
MACE_
FAILURE_RETURN
(
output_tensor
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
padded_output_mapper
(
output_tensor
);
float
*
output_data
=
output_tensor
->
mutable_data
<
float
>
();
...
...
@@ -378,7 +378,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const
int
pad_width
=
pad_left
+
pad_right
;
std
::
vector
<
index_t
>
output_shape
(
{
batch
,
channels
,
height
+
pad_height
,
width
+
pad_width
});
MACE_
FAILURE_RETURN
(
output_tensor
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
Resize
(
output_shape
));
output_tensor
->
Clear
();
Tensor
::
MappingGuard
padded_output_mapper
(
output_tensor
);
float
*
output_data
=
output_tensor
->
mutable_data
<
float
>
();
...
...
@@ -428,7 +428,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
const
int
padded_top
=
paddings
[
0
]
/
2
;
const
int
padded_left
=
paddings
[
1
]
/
2
;
MACE_
FAILURE_RETURN
(
output_tensor
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
padded_output_mapper
(
output_tensor
);
float
*
output_data
=
output_tensor
->
mutable_data
<
float
>
();
...
...
mace/kernels/deconv_2d.h
浏览文件 @
33415ee9
...
...
@@ -250,7 +250,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_
,
padding_type_
,
output_shape
.
data
(),
paddings_
.
data
(),
true
);
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
}
else
{
output_shape_
.
clear
();
output_shape_
=
std
::
vector
<
index_t
>
(
4
,
0
);
...
...
@@ -259,7 +259,7 @@ struct Deconv2dFunctor : Deconv2dFunctorBase {
strides_
,
output_shape_
.
data
(),
paddings_
.
data
(),
true
);
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape_
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape_
));
}
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
...
...
mace/kernels/depth_to_space.h
浏览文件 @
33415ee9
...
...
@@ -55,7 +55,7 @@ struct DepthToSpaceOpFunctor {
std
::
vector
<
index_t
>
output_shape
=
{
batch_size
,
output_depth
,
output_height
,
output_width
};
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
logits_guard
(
input
);
Tensor
::
MappingGuard
output_guard
(
output
);
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
33415ee9
...
...
@@ -161,7 +161,7 @@ struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
RoundType
::
FLOOR
,
output_shape
.
data
());
}
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
output
->
Clear
();
index_t
batch
=
output
->
dim
(
0
);
...
...
mace/kernels/eltwise.h
浏览文件 @
33415ee9
...
...
@@ -494,7 +494,7 @@ struct EltwiseFunctor<DeviceType::CPU, float>: EltwiseFunctorBase {
}
}
}
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input0
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input0
));
Tensor
::
MappingGuard
input0_guard
(
input0
);
Tensor
::
MappingGuard
output_guard
(
output
);
...
...
mace/kernels/fully_connected.h
浏览文件 @
33415ee9
...
...
@@ -57,7 +57,7 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
std
::
vector
<
index_t
>
output_shape
=
{
input
->
dim
(
0
),
weight
->
dim
(
0
),
1
,
1
};
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
const
index_t
N
=
output
->
dim
(
0
);
const
index_t
input_size
=
weight
->
dim
(
1
)
*
weight
->
dim
(
2
)
*
weight
->
dim
(
3
);
const
index_t
output_size
=
weight
->
dim
(
0
);
...
...
mace/kernels/matmul.h
浏览文件 @
33415ee9
...
...
@@ -44,7 +44,7 @@ struct MatMulFunctor {
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
std
::
vector
<
index_t
>
c_shape
=
{
A
->
dim
(
0
),
A
->
dim
(
1
),
B
->
dim
(
2
),
1
};
MACE_
FAILURE_RETURN
(
C
->
Resize
(
c_shape
));
MACE_
RETURN_IF_ERROR
(
C
->
Resize
(
c_shape
));
Tensor
::
MappingGuard
guarda
(
A
);
Tensor
::
MappingGuard
guardb
(
B
);
...
...
mace/kernels/opencl/activation.cc
浏览文件 @
33415ee9
...
...
@@ -21,9 +21,9 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
MaceStatus
ActivationFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
template
<
typename
T
>
MaceStatus
ActivationFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
alpha
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -47,7 +47,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -56,22 +56,28 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
switch
(
activation_
)
{
case
RELU
:
tuning_key_prefix_
=
"relu_opencl_kernel"
;
case
RELU
:
tuning_key_prefix_
=
"relu_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_RELU"
);
break
;
case
RELUX
:
tuning_key_prefix_
=
"relux_opencl_kernel"
;
case
RELUX
:
tuning_key_prefix_
=
"relux_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_RELUX"
);
break
;
case
PRELU
:
tuning_key_prefix_
=
"prelu_opencl_kernel"
;
case
PRELU
:
tuning_key_prefix_
=
"prelu_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_PRELU"
);
break
;
case
TANH
:
tuning_key_prefix_
=
"tanh_opencl_kernel"
;
case
TANH
:
tuning_key_prefix_
=
"tanh_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_TANH"
);
break
;
case
SIGMOID
:
tuning_key_prefix_
=
"sigmoid_opencl_kernel"
;
case
SIGMOID
:
tuning_key_prefix_
=
"sigmoid_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_SIGMOID"
);
break
;
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
...
...
@@ -121,9 +127,7 @@ MaceStatus ActivationFunctor<DeviceType::GPU,
return
MACE_SUCCESS
;
}
template
struct
ActivationFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
ActivationFunctor
<
DeviceType
::
GPU
,
half
>;
template
struct
ActivationFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
ActivationFunctor
<
DeviceType
::
GPU
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/addn.cc
浏览文件 @
33415ee9
...
...
@@ -59,7 +59,7 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -87,8 +87,8 @@ MaceStatus AddNFunctor<DeviceType::GPU, T>::operator()(
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/batch_norm.cc
浏览文件 @
33415ee9
...
...
@@ -23,7 +23,8 @@ namespace mace {
namespace
kernels
{
template
<
typename
T
>
MaceStatus
BatchNormFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
MaceStatus
BatchNormFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
scale
,
const
Tensor
*
offset
,
const
Tensor
*
mean
,
...
...
@@ -57,7 +58,7 @@ MaceStatus BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
mace/kernels/opencl/bias_add.cc
浏览文件 @
33415ee9
...
...
@@ -50,7 +50,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -91,8 +91,7 @@ MaceStatus BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
.
size
());
for
(
size_t
i
=
0
;
i
<
lws
.
size
();
++
i
)
{
if
(
lws
[
i
]
!=
0
)
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
if
(
lws
[
i
]
!=
0
)
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
33415ee9
...
...
@@ -25,14 +25,13 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
const
BufferType
type
,
Tensor
*
image
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
buffer
->
shape
(),
type
,
&
image_shape
);
if
(
type
==
WINOGRAD_FILTER
)
{
std
::
vector
<
index_t
>
new_shape
=
CalWinogradShape
(
buffer
->
shape
(),
type
);
MACE_
FAILURE_RETURN
(
image
->
ResizeImage
(
new_shape
,
image_shape
));
MACE_
RETURN_IF_ERROR
(
image
->
ResizeImage
(
new_shape
,
image_shape
));
}
else
{
MACE_
FAILURE_RETURN
(
image
->
ResizeImage
(
buffer
->
shape
(),
image_shape
));
MACE_
RETURN_IF_ERROR
(
image
->
ResizeImage
(
buffer
->
shape
(),
image_shape
));
}
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
image_shape
[
0
]),
...
...
@@ -94,7 +93,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
if
(
!
kernel_error_
)
{
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -120,8 +119,7 @@ MaceStatus BufferToImageFunctor<DeviceType::GPU, T>::operator()(
static_cast
<
uint32_t
>
(
buffer
->
buffer_offset
()
/
GetEnumTypeSize
(
buffer
->
dtype
())));
if
(
type
==
CONV2D_FILTER
)
{
const
index_t
inner_size
=
buffer
->
dim
(
1
)
*
buffer
->
dim
(
2
)
*
buffer
->
dim
(
3
);
const
index_t
inner_size
=
buffer
->
dim
(
1
)
*
buffer
->
dim
(
2
)
*
buffer
->
dim
(
3
);
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
0
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
2
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
33415ee9
...
...
@@ -16,18 +16,16 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
MaceStatus
ChannelShuffleFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
const
index_t
batch
=
input
->
dim
(
0
);
const
index_t
height
=
input
->
dim
(
1
);
...
...
@@ -36,8 +34,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const
index_t
channels_per_group
=
channels
/
groups_
;
MACE_CHECK
(
channels_per_group
%
4
==
0
,
"channels per group must be multiple of 4"
);
MACE_CHECK
(
groups_
%
4
==
0
,
"groups must be multiple of 4"
);
MACE_CHECK
(
groups_
%
4
==
0
,
"groups must be multiple of 4"
);
const
index_t
group_channel_blocks
=
RoundUpDiv4
(
channels_per_group
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
...
...
@@ -57,7 +54,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -65,8 +62,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
...
...
@@ -93,8 +90,8 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"channel_shuffle_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"channel_shuffle_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -107,9 +104,7 @@ MaceStatus ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
return
MACE_SUCCESS
;
}
template
struct
ChannelShuffleFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
ChannelShuffleFunctor
<
DeviceType
::
GPU
,
half
>;
template
struct
ChannelShuffleFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
ChannelShuffleFunctor
<
DeviceType
::
GPU
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/concat.cc
浏览文件 @
33415ee9
...
...
@@ -22,11 +22,9 @@ namespace mace {
namespace
kernels
{
namespace
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws
[
1
]);
...
...
@@ -37,8 +35,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
// namespace
static
void
Concat2
(
cl
::
Kernel
*
kernel
,
static
MaceStatus
Concat2
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input0
,
const
Tensor
*
input1
,
const
DataType
dt
,
...
...
@@ -68,7 +65,7 @@ static void Concat2(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -115,8 +112,8 @@ static void Concat2(cl::Kernel *kernel,
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
);
std
::
string
tuning_key
=
Concat
(
"concat_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"concat_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -125,9 +122,11 @@ static void Concat2(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
static
void
ConcatN
(
cl
::
Kernel
*
kernel
,
static
MaceStatus
ConcatN
(
cl
::
Kernel
*
kernel
,
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
const
DataType
dt
,
Tensor
*
output
,
...
...
@@ -150,7 +149,7 @@ static void ConcatN(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -218,8 +217,8 @@ static void ConcatN(cl::Kernel *kernel,
if
(
runtime
->
is_profiling_enabled
())
{
CallStats
tmp_stats
;
runtime
->
GetCallStats
(
event
,
&
tmp_stats
);
call_stats
.
start_micros
=
std
::
min
<
int64_t
>
(
tmp_stats
.
start_micros
,
call_stats
.
start_micros
);
call_stats
.
start_micros
=
std
::
min
<
int64_t
>
(
tmp_stats
.
start_micros
,
call_stats
.
start_micros
);
call_stats
.
end_micros
+=
tmp_stats
.
end_micros
-
tmp_stats
.
start_micros
;
}
}
...
...
@@ -232,6 +231,8 @@ static void ConcatN(cl::Kernel *kernel,
}
};
}
return
MACE_SUCCESS
;
}
template
<
typename
T
>
...
...
@@ -266,17 +267,17 @@ MaceStatus ConcatFunctor<DeviceType::GPU, T>::operator()(
"Dimensions of inputs should be divisible by 4 when inputs_count > 2."
);
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
switch
(
inputs_count
)
{
case
2
:
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
break
;
return
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
]
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
)
;
default:
if
(
divisible_four
)
{
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
return
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
...
...
mace/kernels/opencl/conv_2d.cc
浏览文件 @
33415ee9
...
...
@@ -18,7 +18,7 @@
namespace
mace
{
namespace
kernels
{
extern
void
Conv2dOpenclK1x1
(
cl
::
Kernel
*
kernel
,
extern
MaceStatus
Conv2dOpenclK1x1
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -34,7 +34,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
extern
MaceStatus
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -50,7 +50,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
extern
MaceStatus
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -72,7 +72,7 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
typedef
void
(
*
Conv2dOpenclFunction
)(
typedef
MaceStatus
(
*
Conv2dOpenclFunction
)(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
int
*
dilations
,
const
ActivationType
activation
,
...
...
@@ -111,23 +111,21 @@ MaceStatus Conv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
if
(
kernel_h
==
kernel_w
&&
kernel_h
<=
5
&&
selector
[
kernel_h
-
1
]
!=
nullptr
)
{
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit
_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
return
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations
_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit
_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
return
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations
_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
}
return
MACE_SUCCESS
;
}
template
struct
Conv2dFunctor
<
DeviceType
::
GPU
,
float
>;
...
...
mace/kernels/opencl/conv_2d_1x1.cc
浏览文件 @
33415ee9
...
...
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
...
...
@@ -25,11 +25,9 @@ namespace {
const
uint32_t
kernel_cache_size
=
(
4
+
4
+
4
)
*
4
*
4
;
// TODO(liuqi): Fix the specific value.
const
uint32_t
lws_limit
=
128
;
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
compute_units
=
OpenCLRuntime
::
Global
()
->
device_compute_units
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
...
...
@@ -46,8 +44,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
lws
[
0
],
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
0
]
*
lws
[
1
];
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
(
cache_size
/
kernel_cache_size
/
lws_size
/
compute_units
)
*
8
,
gws
[
2
]);
(
cache_size
/
kernel_cache_size
/
lws_size
/
compute_units
)
*
8
,
gws
[
2
]);
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
);
}
...
...
@@ -57,7 +54,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
// namespace
extern
void
Conv2dOpenclK1x1
(
cl
::
Kernel
*
kernel
,
extern
MaceStatus
Conv2dOpenclK1x1
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -101,7 +98,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -172,8 +169,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
);
std
::
string
tuning_key
=
Concat
(
"conv2d_1x1_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"conv2d_1x1_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -182,6 +179,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_3x3.cc
浏览文件 @
33415ee9
...
...
@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
...
...
@@ -24,22 +24,20 @@ namespace kernels {
namespace
{
// (inputs + weights + outputs) * array_size * sizeof(float)
const
uint32_t
kernel_cache_size
=
(
5
+
4
+
5
)
*
4
*
4
;
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
compute_units
=
std
::
max
<
uint32_t
>
(
OpenCLRuntime
::
Global
()
->
device_compute_units
()
/
2
,
1
);
const
uint32_t
base
=
std
::
min
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
4
);
const
uint32_t
base
=
std
::
min
<
uint32_t
>
(
cache_size
/
kBaseGPUMemCacheSize
,
4
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
0
],
base
),
kwg_size
/
lws
[
1
]);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
0
],
base
),
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
0
]
*
lws
[
1
];
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
RoundUp
<
uint32_t
>
(
cache_size
/
kernel_cache_size
/
lws_size
/
compute_units
,
base
),
RoundUp
<
uint32_t
>
(
cache_size
/
kernel_cache_size
/
lws_size
/
compute_units
,
base
),
gws
[
2
]);
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
);
...
...
@@ -50,7 +48,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
// namespace
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
extern
MaceStatus
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -87,7 +85,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -159,8 +157,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
);
std
::
string
tuning_key
=
Concat
(
"conv2d_3x3_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"conv2d_3x3_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -169,6 +167,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_general.cc
浏览文件 @
33415ee9
...
...
@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
...
...
@@ -30,8 +30,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
const
uint32_t
kernel_size
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
compute_units
=
OpenCLRuntime
::
Global
()
->
device_compute_units
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
...
...
@@ -41,9 +40,9 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
lws
[
0
],
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
0
]
*
lws
[
1
];
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
(
cache_size
/
kernel_cache_size
/
kernel_size
/
lws_size
/
compute_units
)
*
8
,
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
(
cache_size
/
kernel_cache_size
/
kernel_size
/
lws_size
/
compute_units
)
*
8
,
gws
[
2
]);
if
(
lws
[
2
]
==
0
)
{
if
(
gws
[
2
]
<
lws_limit
)
{
...
...
@@ -58,7 +57,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
// namespace
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
extern
MaceStatus
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -95,7 +94,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -168,9 +167,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
}
std
::
string
tuning_key
=
Concat
(
"conv2d_general_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
filter
->
dim
(
2
),
filter
->
dim
(
3
));
Concat
(
"conv2d_general_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
filter
->
dim
(
2
),
filter
->
dim
(
3
));
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
filter
->
dim
(
2
)
*
filter
->
dim
(
3
),
*
kwg_size
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
...
...
@@ -181,6 +179,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
}
// namespace kernels
...
...
mace/kernels/opencl/deconv_2d_opencl.cc
浏览文件 @
33415ee9
...
...
@@ -20,7 +20,7 @@ namespace kernels {
namespace
{
void
Deconv2dOpencl
(
cl
::
Kernel
*
kernel
,
MaceStatus
Deconv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
...
...
@@ -46,10 +46,10 @@ void Deconv2dOpencl(cl::Kernel *kernel,
#define MACE_WIDTH_BLK 5
const
index_t
n_strides
=
(
width
+
stride
-
1
)
/
stride
;
const
index_t
width_blocks
=
((
n_strides
+
MACE_WIDTH_BLK
-
1
)
/
MACE_WIDTH_BLK
)
*
stride
;
((
n_strides
+
MACE_WIDTH_BLK
-
1
)
/
MACE_WIDTH_BLK
)
*
stride
;
const
float
stride_r
=
1.
f
/
static_cast
<
float
>
(
stride
);
const
int
padding_h
=
(
paddings
[
0
]
+
1
)
>>
1
;
const
int
padding_w
=
(
paddings
[
0
]
+
1
)
>>
1
;
const
int
padding_h
=
(
paddings
[
0
]
+
1
)
>>
1
;
const
int
padding_w
=
(
paddings
[
0
]
+
1
)
>>
1
;
const
int
align_h
=
stride
-
1
-
padding_h
;
const
int
align_w
=
stride
-
1
-
padding_w
;
...
...
@@ -67,7 +67,7 @@ void Deconv2dOpencl(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -77,16 +77,22 @@ void Deconv2dOpencl(cl::Kernel *kernel,
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
switch
(
activation
)
{
case
NOOP
:
break
;
case
RELU
:
built_options
.
emplace
(
"-DUSE_RELU"
);
case
NOOP
:
break
;
case
RELUX
:
built_options
.
emplace
(
"-DUSE_RELUX"
);
case
RELU
:
built_options
.
emplace
(
"-DUSE_RELU"
);
break
;
case
TANH
:
built_options
.
emplace
(
"-DUSE_TANH"
);
case
RELUX
:
built_options
.
emplace
(
"-DUSE_RELUX"
);
break
;
case
SIGMOID
:
built_options
.
emplace
(
"-DUSE_SIGMOID"
);
case
TANH
:
built_options
.
emplace
(
"-DUSE_TANH"
);
break
;
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
case
SIGMOID
:
built_options
.
emplace
(
"-DUSE_SIGMOID"
);
break
;
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
*
kernel
=
runtime
->
BuildKernel
(
"deconv_2d"
,
kernel_name
,
built_options
);
...
...
@@ -150,12 +156,15 @@ void Deconv2dOpencl(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
}
// namespace
template
<
typename
T
>
MaceStatus
Deconv2dFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
MaceStatus
Deconv2dFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
,
...
...
@@ -167,34 +176,25 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
if
(
output_shape_
.
size
()
==
4
)
{
paddings_
.
clear
();
paddings_
=
std
::
vector
<
int
>
(
2
,
0
);
CalcDeconvPaddingAndInputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
strides_
,
padding_type_
,
output_shape_
.
data
(),
CalcDeconvPaddingAndInputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
strides_
,
padding_type_
,
output_shape_
.
data
(),
paddings_
.
data
());
}
else
{
output_shape_
.
clear
();
output_shape_
=
std
::
vector
<
index_t
>
(
4
,
0
);
CalcDeconvOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
strides_
,
output_shape_
.
data
(),
paddings_
.
data
());
CalcDeconvOutputSize
(
input
->
shape
().
data
(),
filter
->
shape
().
data
(),
strides_
,
output_shape_
.
data
(),
paddings_
.
data
());
}
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape_
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape_
,
output_image_shape
));
Deconv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings_
.
data
(),
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
MACE_RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape_
,
output_image_shape
));
return
MACE_SUCCESS
;
return
Deconv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings_
.
data
(),
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
}
template
struct
Deconv2dFunctor
<
DeviceType
::
GPU
,
float
>;
...
...
mace/kernels/opencl/depth_to_space.cc
浏览文件 @
33415ee9
...
...
@@ -70,7 +70,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
auto
runtime
=
OpenCLRuntime
::
Global
();
...
...
@@ -87,7 +87,7 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -95,9 +95,8 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"depth_to_space"
,
obfuscated_kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"depth_to_space"
,
obfuscated_kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
...
...
mace/kernels/opencl/depthwise_conv.cc
浏览文件 @
33415ee9
...
...
@@ -24,8 +24,7 @@ namespace kernels {
namespace
{
// (inputs + weights + outputs) * array_size * sizeof(float)
const
uint32_t
kernel_cache_size
=
(
4
+
4
+
1
)
*
4
*
4
;
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
min_lws0
=
cache_size
/
kBaseGPUMemCacheSize
;
...
...
@@ -40,8 +39,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
}
const
uint32_t
lws_size
=
lws
[
0
]
*
lws
[
1
];
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
(
cache_size
/
kernel_cache_size
/
lws_size
)
*
4
,
lws
[
2
]
=
std
::
min
<
uint32_t
>
((
cache_size
/
kernel_cache_size
/
lws_size
)
*
4
,
gws
[
2
]);
if
(
lws
[
2
]
==
0
)
{
lws
[
2
]
=
gws
[
2
];
...
...
@@ -52,7 +50,7 @@ std::vector<uint32_t> LocalWS(const uint32_t *gws,
}
// namespace
static
void
DepthwiseConv2d
(
cl
::
Kernel
*
kernel
,
static
MaceStatus
DepthwiseConv2d
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
// NHWC
const
Tensor
*
filter
,
// HWIM
const
Tensor
*
bias
,
...
...
@@ -98,7 +96,7 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -181,8 +179,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
}
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
);
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel"
,
gws
[
0
],
gws
[
1
],
gws
[
2
],
multiplier
);
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel"
,
gws
[
0
],
gws
[
1
],
gws
[
2
],
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -191,6 +189,8 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
template
<
typename
T
>
...
...
@@ -200,7 +200,6 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
const
Tensor
*
bias
,
Tensor
*
output
,
StatsFuture
*
future
)
{
index_t
kernel_h
=
filter
->
dim
(
2
);
index_t
kernel_w
=
filter
->
dim
(
3
);
if
(
strides_
[
0
]
!=
strides_
[
1
])
{
...
...
@@ -237,14 +236,12 @@ MaceStatus DepthwiseConv2dFunctor<DeviceType::GPU, T>::operator()(
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
return
MACE_SUCCESS
;
return
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
}
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
GPU
,
float
>;
...
...
mace/kernels/opencl/eltwise.cc
浏览文件 @
33415ee9
...
...
@@ -28,9 +28,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
MACE_UNUSED
(
future
);
bool
swapped
=
false
;
if
(
input1
!=
nullptr
)
{
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
()
||
input0
->
dim_size
()
==
1
||
input1
->
dim_size
()
==
1
)
MACE_CHECK
(
input0
->
dim_size
()
==
input1
->
dim_size
()
||
input0
->
dim_size
()
==
1
||
input1
->
dim_size
()
==
1
)
<<
"Inputs of Eltwise op must be same shape"
;
if
(
input0
->
size
()
!=
input1
->
size
())
{
if
(
input0
->
size
()
<
input1
->
size
())
{
...
...
@@ -42,25 +41,23 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
<<
"Element-Wise op only support channel dimension broadcast"
;
}
else
{
MACE_CHECK
((
input0
->
dim
(
0
)
==
input1
->
dim
(
0
)
||
input1
->
dim
(
0
)
==
1
)
&&
input0
->
dim
(
3
)
==
input1
->
dim
(
3
)
&&
input1
->
dim
(
1
)
==
1
&&
input0
->
dim
(
3
)
==
input1
->
dim
(
3
)
&&
input1
->
dim
(
1
)
==
1
&&
input1
->
dim
(
2
)
==
1
)
<<
"Element-Wise op only support channel dimension broadcast"
;
}
}
}
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
output_shape
(
4
);
output_shape
[
0
]
=
input0
->
dim
(
0
);
output_shape
[
1
]
=
input0
->
dim
(
1
);
output_shape
[
2
]
=
input0
->
dim
(
2
);
output_shape
[
3
]
=
input0
->
dim
(
3
);
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
...
...
@@ -98,7 +95,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -142,8 +139,8 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"eltwise_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"eltwise_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/fully_connected.cc
浏览文件 @
33415ee9
...
...
@@ -20,7 +20,7 @@ namespace kernels {
namespace
{
template
<
typename
T
>
void
FCWXKernel
(
cl
::
Kernel
*
kernel
,
MaceStatus
FCWXKernel
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
weight
,
const
Tensor
*
bias
,
...
...
@@ -75,7 +75,7 @@ void FCWXKernel(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -170,10 +170,12 @@ void FCWXKernel(cl::Kernel *kernel,
}
};
}
return
MACE_SUCCESS
;
}
template
<
typename
T
>
void
FCWTXKernel
(
cl
::
Kernel
*
kernel
,
MaceStatus
FCWTXKernel
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
weight
,
const
Tensor
*
bias
,
...
...
@@ -202,7 +204,7 @@ void FCWTXKernel(cl::Kernel *kernel,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
(
*
kernel_error
)
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
((
*
kernel_error
)
->
Allocate
(
1
)
);
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
...
...
@@ -233,7 +235,7 @@ void FCWTXKernel(cl::Kernel *kernel,
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
*
lws
=
{
16
,
kwg_size
/
16
,
0
};
*
lws
=
{
16
,
kwg_size
/
16
,
0
};
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
batch
=
output
->
dim
(
0
);
...
...
@@ -268,8 +270,8 @@ void FCWTXKernel(cl::Kernel *kernel,
}
std
::
string
tuning_key
=
Concat
(
"fc_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"fc_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun2DKernel
(
*
kernel
,
tuning_key
,
gws
->
data
(),
*
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -278,6 +280,8 @@ void FCWTXKernel(cl::Kernel *kernel,
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
return
MACE_SUCCESS
;
}
}
// namespace
...
...
@@ -292,13 +296,11 @@ MaceStatus FullyConnectedFunctor<DeviceType::GPU, T>::operator()(
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
FCWXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
return
FCWXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
,
&
kernel_error_
);
return
MACE_SUCCESS
;
}
template
struct
FullyConnectedFunctor
<
DeviceType
::
GPU
,
float
>;
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
33415ee9
...
...
@@ -209,12 +209,11 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
std
::
vector
<
uint32_t
>
Default3DLocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
1
]
*
lws
[
2
];
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
base
,
kwg_size
/
lws_size
);
return
lws
;
...
...
@@ -278,7 +277,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK
(
params
.
size
()
==
4
)
<<
"Tuning parameters of 3D kernel must be 4D"
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
internal_gws
(
gws
,
gws
+
3
);
std
::
vector
<
uint32_t
>
internal_gws
(
gws
,
gws
+
3
);
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
internal_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
...
...
@@ -287,12 +286,12 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
if
(
timer
==
nullptr
)
{
uint32_t
block_size
=
params
[
3
]
==
0
?
internal_gws
[
2
]
:
params
[
3
];
const
uint32_t
num_blocks
=
RoundUpDiv
<
uint32_t
>
(
internal_gws
[
2
],
block_size
);
const
uint32_t
num_blocks
=
RoundUpDiv
<
uint32_t
>
(
internal_gws
[
2
],
block_size
);
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
block_size
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
gws2
=
(
internal_gws
[
2
]
-
(
i
*
block_size
));
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
...
@@ -324,8 +323,8 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
num_blocks
=
RoundUpDiv
<
uint32_t
>
(
internal_gws
[
2
],
block_size
);
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
block_size
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
gws2
=
(
internal_gws
[
2
]
-
(
i
*
block_size
));
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
...
@@ -365,17 +364,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
std
::
vector
<
std
::
vector
<
uint32_t
>>
results
;
std
::
vector
<
std
::
vector
<
uint32_t
>>
candidates
=
{
{
kwg_size
/
2
,
2
,
0
},
{
kwg_size
/
4
,
4
,
0
},
{
kwg_size
/
8
,
8
,
0
},
{
kwg_size
/
16
,
16
,
0
},
{
kwg_size
/
32
,
32
,
0
},
{
kwg_size
/
64
,
64
,
0
},
{
kwg_size
/
128
,
128
,
0
},
{
kwg_size
/
256
,
256
,
0
},
{
kwg_size
,
1
,
0
},
{
1
,
kwg_size
,
0
}
};
{
kwg_size
/
2
,
2
,
0
},
{
kwg_size
/
4
,
4
,
0
},
{
kwg_size
/
8
,
8
,
0
},
{
kwg_size
/
16
,
16
,
0
},
{
kwg_size
/
32
,
32
,
0
},
{
kwg_size
/
64
,
64
,
0
},
{
kwg_size
/
128
,
128
,
0
},
{
kwg_size
/
256
,
256
,
0
},
{
kwg_size
,
1
,
0
},
{
1
,
kwg_size
,
0
}};
for
(
auto
&
ele
:
candidates
)
{
const
uint32_t
tmp
=
ele
[
0
]
*
ele
[
1
]
*
ele
[
2
];
if
(
0
<
tmp
&&
tmp
<=
kwg_size
)
{
...
...
@@ -390,7 +383,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK
(
params
.
size
()
==
3
)
<<
"Tuning parameters of 2D kernel must be 3d"
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
internal_gws
(
gws
,
gws
+
2
);
std
::
vector
<
uint32_t
>
internal_gws
(
gws
,
gws
+
2
);
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
for
(
size_t
i
=
0
;
i
<
2
;
++
i
)
{
internal_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
...
...
@@ -399,12 +392,12 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
if
(
timer
==
nullptr
)
{
uint32_t
block_size
=
params
[
2
]
==
0
?
internal_gws
[
1
]
:
params
[
2
];
const
uint32_t
num_blocks
=
RoundUpDiv
<
uint32_t
>
(
internal_gws
[
1
],
block_size
);
const
uint32_t
num_blocks
=
RoundUpDiv
<
uint32_t
>
(
internal_gws
[
1
],
block_size
);
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
block_size
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
gws1
=
(
internal_gws
[
1
]
-
(
i
*
block_size
));
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
...
@@ -435,8 +428,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
num_blocks
=
RoundUpDiv
<
uint32_t
>
(
internal_gws
[
1
],
block_size
);
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
block_size
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
if
(
runtime
->
IsNonUniformWorkgroupsSupported
()
&&
(
i
==
num_blocks
-
1
))
{
gws1
=
(
internal_gws
[
1
]
-
(
i
*
block_size
));
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
...
...
@@ -463,6 +456,5 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
}
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/helper.h
浏览文件 @
33415ee9
...
...
@@ -88,8 +88,7 @@ inline bool LimitKernelTime() {
}
template
<
typename
T
>
bool
IsVecEqual
(
const
std
::
vector
<
T
>
&
input0
,
const
std
::
vector
<
T
>
&
input1
)
{
bool
IsVecEqual
(
const
std
::
vector
<
T
>
&
input0
,
const
std
::
vector
<
T
>
&
input1
)
{
return
((
input0
.
size
()
==
input1
.
size
())
&&
(
std
::
equal
(
input0
.
begin
(),
input0
.
end
(),
input1
.
begin
())));
}
...
...
mace/kernels/opencl/image_to_buffer.cc
浏览文件 @
33415ee9
...
...
@@ -25,10 +25,9 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
const
BufferType
type
,
Tensor
*
buffer
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
image
->
shape
(),
type
,
&
image_shape
);
MACE_
FAILURE_RETURN
(
buffer
->
Resize
(
image
->
shape
()));
MACE_
RETURN_IF_ERROR
(
buffer
->
Resize
(
image
->
shape
()));
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
image_shape
[
0
]),
static_cast
<
uint32_t
>
(
image_shape
[
1
])};
...
...
@@ -87,7 +86,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
if
(
!
kernel_error_
)
{
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -108,8 +107,7 @@ MaceStatus ImageToBufferFunctor<DeviceType::GPU, T>::operator()(
}
b2f_kernel
.
setArg
(
idx
++
,
*
(
buffer
->
opencl_buffer
()));
if
(
type
==
CONV2D_FILTER
)
{
const
index_t
inner_size
=
buffer
->
dim
(
1
)
*
buffer
->
dim
(
2
)
*
buffer
->
dim
(
3
);
const
index_t
inner_size
=
buffer
->
dim
(
1
)
*
buffer
->
dim
(
2
)
*
buffer
->
dim
(
3
);
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
0
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
2
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
33415ee9
...
...
@@ -29,7 +29,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
std
::
vector
<
index_t
>
c_shape
=
{
A
->
dim
(
0
),
A
->
dim
(
1
),
B
->
dim
(
2
),
1
};
std
::
vector
<
size_t
>
c_image_shape
;
CalImage2DShape
(
c_shape
,
BufferType
::
IN_OUT_HEIGHT
,
&
c_image_shape
);
MACE_
FAILURE_RETURN
(
C
->
ResizeImage
(
c_shape
,
c_image_shape
));
MACE_
RETURN_IF_ERROR
(
C
->
ResizeImage
(
c_shape
,
c_image_shape
));
const
index_t
batch
=
C
->
dim
(
0
);
const
index_t
height
=
C
->
dim
(
1
);
...
...
@@ -55,7 +55,7 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -87,9 +87,8 @@ MaceStatus MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
64
,
64
,
0
};
std
::
string
tuning_key
=
Concat
(
"matmul_opencl_kernel"
,
C
->
dim
(
0
),
C
->
dim
(
1
),
C
->
dim
(
2
),
C
->
dim
(
3
));
std
::
string
tuning_key
=
Concat
(
"matmul_opencl_kernel"
,
C
->
dim
(
0
),
C
->
dim
(
1
),
C
->
dim
(
2
),
C
->
dim
(
3
));
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/out_of_range_check_test.cc
浏览文件 @
33415ee9
...
...
@@ -58,7 +58,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error
->
Allocate
(
1
)
);
kernel_error
->
Map
(
nullptr
);
*
(
kernel_error
->
mutable_data
<
char
>
())
=
0
;
kernel_error
->
UnMap
();
...
...
@@ -113,8 +113,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
bool
is_out_of_range
=
false
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error
->
Map
(
nullptr
);
is_out_of_range
=
*
(
kernel_error
->
mutable_data
<
char
>
())
==
1
?
true
:
false
;
is_out_of_range
=
*
(
kernel_error
->
mutable_data
<
char
>
())
==
1
?
true
:
false
;
kernel_error
->
UnMap
();
}
return
is_out_of_range
;
...
...
@@ -124,9 +123,7 @@ bool BufferToImageOpImpl(Tensor *buffer,
class
OutOfRangeCheckTest
:
public
::
testing
::
Test
{
protected:
virtual
void
SetUp
()
{
setenv
(
"OUT_OF_RANGE_CHECK"
,
"1"
,
1
);
}
virtual
void
SetUp
()
{
setenv
(
"OUT_OF_RANGE_CHECK"
,
"1"
,
1
);
}
};
TEST
(
OutOfRangeCheckTest
,
RandomTest
)
{
...
...
@@ -137,14 +134,13 @@ TEST(OutOfRangeCheckTest, RandomTest) {
std
::
vector
<
index_t
>
buffer_shape
=
{
batch
,
height
,
width
,
channels
};
Workspace
ws
;
Tensor
*
buffer
=
ws
.
CreateTensor
(
"Buffer"
,
GetDeviceAllocator
(
DeviceType
::
GPU
),
Tensor
*
buffer
=
ws
.
CreateTensor
(
"Buffer"
,
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
float
>::
v
());
buffer
->
Resize
(
buffer_shape
);
std
::
vector
<
size_t
>
image_shape
;
Tensor
*
image
=
ws
.
CreateTensor
(
"Image"
,
GetDeviceAllocator
(
DeviceType
::
GPU
),
Tensor
*
image
=
ws
.
CreateTensor
(
"Image"
,
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
float
>::
v
());
CalImage2DShape
(
buffer
->
shape
(),
IN_OUT_CHANNEL
,
&
image_shape
);
image
->
ResizeImage
(
buffer
->
shape
(),
image_shape
);
...
...
mace/kernels/opencl/pad.cc
浏览文件 @
33415ee9
...
...
@@ -20,26 +20,25 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
MaceStatus
PadFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
template
<
typename
T
>
MaceStatus
PadFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_CHECK
(
this
->
paddings_
.
size
()
==
static_cast
<
size_t
>
((
input
->
dim_size
()
*
2
)));
MACE_CHECK
((
this
->
paddings_
[
0
]
==
0
)
&&
(
this
->
paddings_
[
1
]
==
0
)
&&
(
this
->
paddings_
[
6
]
==
0
)
&&
(
this
->
paddings_
[
7
]
==
0
))
MACE_CHECK
(
this
->
paddings_
.
size
()
==
static_cast
<
size_t
>
((
input
->
dim_size
()
*
2
)));
MACE_CHECK
((
this
->
paddings_
[
0
]
==
0
)
&&
(
this
->
paddings_
[
1
]
==
0
)
&&
(
this
->
paddings_
[
6
]
==
0
)
&&
(
this
->
paddings_
[
7
]
==
0
))
<<
"Mace only support height/width dimension now"
;
auto
input_shape
=
input
->
shape
();
std
::
vector
<
index_t
>
output_shape
=
{
input_shape
[
0
]
+
this
->
paddings_
[
0
]
+
this
->
paddings_
[
1
],
std
::
vector
<
index_t
>
output_shape
=
{
input_shape
[
0
]
+
this
->
paddings_
[
0
]
+
this
->
paddings_
[
1
],
input_shape
[
1
]
+
this
->
paddings_
[
2
]
+
this
->
paddings_
[
3
],
input_shape
[
2
]
+
this
->
paddings_
[
4
]
+
this
->
paddings_
[
5
],
input_shape
[
3
]
+
this
->
paddings_
[
6
]
+
this
->
paddings_
[
7
]};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
...
...
@@ -61,7 +60,7 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -103,9 +102,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
}
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"pad"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
std
::
string
tuning_key
=
Concat
(
"pad"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -118,10 +116,8 @@ MaceStatus PadFunctor<DeviceType::GPU, T>::operator()(
return
MACE_SUCCESS
;
}
template
struct
PadFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
PadFunctor
<
DeviceType
::
GPU
,
half
>;
template
struct
PadFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
PadFunctor
<
DeviceType
::
GPU
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/pooling.cc
浏览文件 @
33415ee9
...
...
@@ -23,15 +23,13 @@ namespace kernels {
namespace
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
std
::
min
<
uint32_t
>
(
gws
[
2
],
base
),
kwg_size
/
lws
[
1
]);
const
uint32_t
lws_size
=
lws
[
1
]
*
lws
[
2
];
lws
[
0
]
=
gws
[
0
]
/
4
;
if
(
lws
[
0
]
==
0
)
{
...
...
@@ -73,7 +71,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -108,7 +106,7 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
...
...
@@ -159,8 +157,8 @@ MaceStatus PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
.
data
(),
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"pooling_opencl_kernel_"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"pooling_opencl_kernel_"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
.
data
(),
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/resize_bilinear.cc
浏览文件 @
33415ee9
...
...
@@ -23,11 +23,9 @@ namespace mace {
namespace
kernels
{
namespace
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
if
(
lws
[
1
]
>=
base
)
{
...
...
@@ -79,7 +77,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -100,7 +98,7 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_
FAILURE_RETURN
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
output_image_shape
));
float
height_scale
=
CalculateResizeScale
(
in_height
,
out_height
,
align_corners_
);
...
...
@@ -130,8 +128,8 @@ MaceStatus ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"resize_bilinear_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"resize_bilinear_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
33415ee9
...
...
@@ -20,7 +20,7 @@
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
MaceStatus
SliceFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
const
std
::
vector
<
Tensor
*>
&
output_list
,
...
...
@@ -30,13 +30,14 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const
index_t
output_channels
=
input_channels
/
outputs_count
;
MACE_CHECK
(
output_channels
%
4
==
0
)
<<
"output channels of slice op must be divisible by 4"
;
std
::
vector
<
index_t
>
output_shape
(
{
input
->
dim
(
0
),
input
->
dim
(
1
),
input
->
dim
(
2
),
output_channels
});
std
::
vector
<
index_t
>
output_shape
(
{
input
->
dim
(
0
),
input
->
dim
(
1
),
input
->
dim
(
2
),
output_channels
});
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
for
(
size_t
i
=
0
;
i
<
outputs_count
;
++
i
)
{
MACE_FAILURE_RETURN
(
output_list
[
i
]
->
ResizeImage
(
output_shape
,
image_shape
));
for
(
size_t
i
=
0
;
i
<
outputs_count
;
++
i
)
{
MACE_RETURN_IF_ERROR
(
output_list
[
i
]
->
ResizeImage
(
output_shape
,
image_shape
));
}
auto
runtime
=
OpenCLRuntime
::
Global
();
...
...
@@ -46,13 +47,13 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"slice"
);
built_options
.
emplace
(
"-Dslice="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -68,8 +69,7 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
const
index_t
channel_blk
=
RoundUpDiv4
(
output_channels
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
};
...
...
@@ -117,8 +117,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
if
(
runtime
->
is_profiling_enabled
())
{
CallStats
tmp_stats
;
runtime
->
GetCallStats
(
event
,
&
tmp_stats
);
call_stats
.
start_micros
=
std
::
min
<
int64_t
>
(
tmp_stats
.
start_micros
,
call_stats
.
start_micros
);
call_stats
.
start_micros
=
std
::
min
<
int64_t
>
(
tmp_stats
.
start_micros
,
call_stats
.
start_micros
);
call_stats
.
end_micros
+=
tmp_stats
.
end_micros
-
tmp_stats
.
start_micros
;
}
}
...
...
@@ -135,10 +135,8 @@ MaceStatus SliceFunctor<DeviceType::GPU, T>::operator()(
return
MACE_SUCCESS
;
}
template
struct
SliceFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
SliceFunctor
<
DeviceType
::
GPU
,
half
>;
template
struct
SliceFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
SliceFunctor
<
DeviceType
::
GPU
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/softmax.cc
浏览文件 @
33415ee9
...
...
@@ -24,10 +24,8 @@ namespace kernels {
namespace
{
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
std
::
vector
<
uint32_t
>
LocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
)
{
uint64_t
cache_size
=
OpenCLRuntime
::
Global
()
->
device_global_mem_cache_size
();
uint32_t
base
=
cache_size
/
kBaseGPUMemCacheSize
;
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
);
...
...
@@ -71,7 +69,7 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -105,8 +103,8 @@ MaceStatus SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"softmax_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
Concat
(
"softmax_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/space_to_batch.cc
浏览文件 @
33415ee9
...
...
@@ -26,17 +26,13 @@ namespace kernels {
template
<
typename
T
>
MaceStatus
SpaceToBatchFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
)
{
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
if
(
b2s_
)
{
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NHWC
,
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NHWC
,
output_shape
.
data
());
}
else
{
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NHWC
,
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NHWC
,
output_shape
.
data
());
}
...
...
@@ -45,12 +41,12 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
if
(
b2s_
)
{
MACE_
FAILURE_RETURN
(
space_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
space_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
kernel_name
=
"batch_to_space"
;
}
else
{
MACE_
FAILURE_RETURN
(
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
MACE_
RETURN_IF_ERROR
(
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
kernel_name
=
"space_to_batch"
;
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
...
...
@@ -73,7 +69,7 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -81,9 +77,8 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"space_to_batch"
,
obfuscated_kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"space_to_batch"
,
obfuscated_kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
33415ee9
...
...
@@ -24,7 +24,6 @@ namespace kernels {
template
<
typename
T
>
MaceStatus
WinogradTransformFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
...
...
@@ -40,7 +39,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -78,7 +77,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_HEIGHT
,
&
image_shape
);
MACE_
FAILURE_RETURN
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -103,10 +102,9 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
),
output_tensor
->
dim
(
3
));
std
::
string
tuning_key
=
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
),
output_tensor
->
dim
(
3
));
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
@@ -125,7 +123,6 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
const
Tensor
*
bias
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
...
...
@@ -142,7 +139,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
GPU
))));
kernel_error_
->
Allocate
(
1
);
MACE_RETURN_IF_ERROR
(
kernel_error_
->
Allocate
(
1
)
);
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
...
...
@@ -188,7 +185,7 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
input_tensor
->
dim
(
1
)};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_
FAILURE_RETURN
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
));
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
...
...
mace/kernels/pad.h
浏览文件 @
33415ee9
...
...
@@ -51,7 +51,7 @@ struct PadFunctor : public PadFunctorBase {
MACE_CHECK
(
this
->
paddings_
.
size
()
==
static_cast
<
size_t
>
(
input
->
dim_size
())
*
2
);
auto
input_shape
=
input
->
shape
();
MACE_
FAILURE_RETURN
(
output
->
Resize
({
input_shape
[
0
]
+
this
->
paddings_
[
0
]
MACE_
RETURN_IF_ERROR
(
output
->
Resize
({
input_shape
[
0
]
+
this
->
paddings_
[
0
]
+
this
->
paddings_
[
1
],
input_shape
[
1
]
+
this
->
paddings_
[
2
]
+
this
->
paddings_
[
3
],
...
...
mace/kernels/pooling.h
浏览文件 @
33415ee9
...
...
@@ -190,7 +190,7 @@ struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
RoundType
::
CEIL
,
output_shape
.
data
());
}
MACE_
FAILURE_RETURN
(
output_tensor
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
input_guard
(
input_tensor
);
Tensor
::
MappingGuard
output_guard
(
output_tensor
);
...
...
mace/kernels/proposal.h
浏览文件 @
33415ee9
...
...
@@ -267,7 +267,7 @@ struct ProposalFunctor {
// Our RPN implementation only supports a single input image, so all
// batch inds are 0
size
=
static_cast
<
int
>
(
nms_result
.
size
());
MACE_
FAILURE_RETURN
(
output
->
Resize
({
size
,
1
,
1
,
5
}));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
({
size
,
1
,
1
,
5
}));
auto
output_ptr
=
output
->
mutable_data
<
float
>
();
#pragma omp parallel for
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
...
...
mace/kernels/psroi_align.h
浏览文件 @
33415ee9
...
...
@@ -50,7 +50,7 @@ struct PSROIAlignFunctor {
const
index_t
num_rois
=
rois
->
dim
(
0
);
const
index_t
batch_size
=
input
->
dim
(
0
);
MACE_
FAILURE_RETURN
(
output
->
Resize
({
num_rois
,
pooled_height
,
pooled_width
,
MACE_
RETURN_IF_ERROR
(
output
->
Resize
({
num_rois
,
pooled_height
,
pooled_width
,
output_dim_
}));
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
33415ee9
...
...
@@ -150,7 +150,7 @@ struct ResizeBilinearFunctor<DeviceType::CPU, float>
index_t
out_width
=
out_width_
;
MACE_CHECK
(
out_height
>
0
&&
out_width
>
0
);
std
::
vector
<
index_t
>
out_shape
{
batch
,
channels
,
out_height
,
out_width
};
MACE_
FAILURE_RETURN
(
output
->
Resize
(
out_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
out_shape
));
Tensor
::
MappingGuard
input_mapper
(
input
);
Tensor
::
MappingGuard
output_mapper
(
output
);
...
...
mace/kernels/slice.h
浏览文件 @
33415ee9
...
...
@@ -61,7 +61,7 @@ struct SliceFunctor : SliceFunctorBase {
1
,
std
::
multiplies
<
index_t
>
());
for
(
size_t
i
=
0
;
i
<
outputs_count
;
++
i
)
{
MACE_
FAILURE_RETURN
(
output_list
[
i
]
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output_list
[
i
]
->
Resize
(
output_shape
));
output_ptrs
[
i
]
=
output_list
[
i
]
->
mutable_data
<
T
>
();
}
const
T
*
input_ptr
=
input
->
data
<
T
>
();
...
...
mace/kernels/space_to_batch.h
浏览文件 @
33415ee9
...
...
@@ -150,12 +150,12 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NCHW
,
output_shape
.
data
());
MACE_
FAILURE_RETURN
(
space_tensor
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
space_tensor
->
Resize
(
output_shape
));
}
else
{
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NCHW
,
output_shape
.
data
());
MACE_
FAILURE_RETURN
(
batch_tensor
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
batch_tensor
->
Resize
(
output_shape
));
}
Tensor
::
MappingGuard
input_guard
(
space_tensor
);
...
...
mace/ops/BUILD
浏览文件 @
33415ee9
...
...
@@ -15,7 +15,6 @@ cc_library(
hdrs
=
[
"ops_test_util.h"
,
],
copts
=
[
"-Werror"
,
"-Wextra"
,
"-Wno-missing-field-initializers"
],
deps
=
[
"//mace/core"
,
"@gtest//:gtest"
,
...
...
@@ -36,18 +35,23 @@ cc_library(
[
"buffer_to_image.cc"
,
"image_to_buffer.cc"
,
]),
],
),
hdrs
=
glob
(
[
"*.h"
],
exclude
=
[
"ops_test_util.h"
],
),
copts
=
[
"-Werror"
,
"-Wextra"
,
"-Wno-missing-field-initializers"
]
+
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
])
+
if_android_armv7
([
"-mfpu=neon"
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
])
+
if_android
([
"-DMACE_ENABLE_OPENCL"
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
]),
copts
=
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
,
])
+
if_android
([
"-DMACE_ENABLE_OPENCL"
,
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
,
]),
deps
=
[
"//mace/kernels"
,
],
...
...
@@ -60,13 +64,17 @@ cc_test(
srcs
=
glob
(
[
"*_test.cc"
],
),
copts
=
[
"-Werror"
,
"-Wextra"
,
"-Wno-missing-field-initializers"
]
+
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
])
+
if_android_armv7
([
"-mfpu=neon"
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
])
+
if_android
([
"-DMACE_ENABLE_OPENCL"
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
]),
copts
=
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
,
])
+
if_android
([
"-DMACE_ENABLE_OPENCL"
,
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
,
]),
linkopts
=
[
"-fopenmp"
],
linkstatic
=
1
,
deps
=
[
...
...
@@ -80,13 +88,17 @@ cc_test(
name
=
"ops_benchmark"
,
testonly
=
1
,
srcs
=
glob
([
"*_benchmark.cc"
]),
copts
=
[
"-Werror"
,
"-Wextra"
,
"-Wno-missing-field-initializers"
]
+
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
])
+
if_android_armv7
([
"-mfpu=neon"
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
])
+
if_android
([
"-DMACE_ENABLE_OPENCL"
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
]),
copts
=
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
,
])
+
if_android
([
"-DMACE_ENABLE_OPENCL"
,
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
,
]),
linkopts
=
[
"-fopenmp"
],
linkstatic
=
1
,
deps
=
[
...
...
mace/ops/activation.h
浏览文件 @
33415ee9
...
...
@@ -31,15 +31,15 @@ class ActivationOp : public Operator<D, T> {
functor_
(
kernels
::
StringToActivationType
(
OperatorBase
::
GetOptionalArg
<
std
::
string
>
(
"activation"
,
"NOOP"
)),
static_cast
<
T
>
(
OperatorBase
::
GetOptionalArg
<
float
>
(
"max_limit"
,
0.0
f
)))
{}
static_cast
<
T
>
(
OperatorBase
::
GetOptionalArg
<
float
>
(
"max_limit"
,
0.0
f
)))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
0
);
const
Tensor
*
alpha_tensor
=
this
->
InputSize
()
>=
2
?
this
->
Input
(
1
)
:
nullptr
;
Tensor
*
output_tensor
=
this
->
Output
(
0
);
MACE_
FAILURE_RETURN
(
output_tensor
->
ResizeLike
(
input_tensor
));
MACE_
RETURN_IF_ERROR
(
output_tensor
->
ResizeLike
(
input_tensor
));
return
functor_
(
input_tensor
,
alpha_tensor
,
output_tensor
,
future
);
}
...
...
mace/ops/activation_test.cc
浏览文件 @
33415ee9
...
...
@@ -120,7 +120,6 @@ TEST_F(ActivationOpTest, OPENCLUnalignedSimpleRelu) {
TestUnalignedSimpleRelu
<
DeviceType
::
GPU
>
();
}
namespace
{
template
<
DeviceType
D
>
void
TestSimpleRelux
()
{
...
...
@@ -169,9 +168,7 @@ void TestSimpleRelux() {
TEST_F
(
ActivationOpTest
,
CPUSimple
)
{
TestSimpleRelux
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ActivationOpTest
,
OPENCLSimple
)
{
TestSimpleRelux
<
DeviceType
::
GPU
>
();
}
TEST_F
(
ActivationOpTest
,
OPENCLSimple
)
{
TestSimpleRelux
<
DeviceType
::
GPU
>
();
}
namespace
{
template
<
DeviceType
D
>
...
...
@@ -278,9 +275,7 @@ void TestSimplePrelu() {
}
}
// namespace
TEST_F
(
ActivationOpTest
,
CPUSimplePrelu
)
{
TestSimplePrelu
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ActivationOpTest
,
CPUSimplePrelu
)
{
TestSimplePrelu
<
DeviceType
::
CPU
>
();
}
TEST_F
(
ActivationOpTest
,
OPENCLSimplePrelu
)
{
TestSimplePrelu
<
DeviceType
::
GPU
>
();
...
...
mace/ops/addn_test.cc
浏览文件 @
33415ee9
...
...
@@ -97,8 +97,8 @@ void SimpleAdd3() {
net
.
RunOp
(
D
);
}
auto
expected
=
CreateTensor
<
float
>
({
1
,
2
,
3
,
1
},
{
-
0.000713
,
8
,
12
,
16
,
20
,
24
});
auto
expected
=
CreateTensor
<
float
>
({
1
,
2
,
3
,
1
},
{
-
0.000713
,
8
,
12
,
16
,
20
,
24
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-4
,
1e-3
);
}
...
...
@@ -160,8 +160,8 @@ void RandomTest() {
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-2
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-2
);
}
}
}
// namespace
...
...
mace/ops/batch_norm.h
浏览文件 @
33415ee9
...
...
@@ -51,7 +51,7 @@ class BatchNormOp : public Operator<D, T> {
var
->
dim_size
());
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
return
functor_
(
input
,
scale
,
offset
,
mean
,
var
,
epsilon_
,
output
,
future
);
}
...
...
mace/ops/batch_norm_test.cc
浏览文件 @
33415ee9
...
...
@@ -22,7 +22,7 @@ namespace test {
class
BatchNormOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
()
{
OpsTestNet
net
;
...
...
@@ -79,10 +79,9 @@ void Simple() {
}
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
6
,
2
,
1
},
{
-
3.8543
,
-
3.8543
,
-
1.5125
,
-
1.5125
,
0.8291
,
0.8291
,
3.1708
,
3.1708
,
5.5125
,
5.5125
,
7.8543
,
7.8543
});
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
6
,
2
,
1
},
{
-
3.8543
,
-
3.8543
,
-
1.5125
,
-
1.5125
,
0.8291
,
0.8291
,
3.1708
,
3.1708
,
5.5125
,
5.5125
,
7.8543
,
7.8543
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-4
);
}
...
...
@@ -103,16 +102,14 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Mean"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Var"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
...
...
@@ -129,9 +126,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -186,16 +181,14 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Mean"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Var"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BatchNorm"
,
"BatchNormTest"
)
...
...
@@ -211,9 +204,7 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -269,16 +260,14 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Mean"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Var"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BatchNorm"
,
"BatchNormTest"
)
...
...
@@ -294,9 +283,7 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -351,16 +338,14 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Mean"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Var"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BatchNorm"
,
"BatchNormTest"
)
...
...
@@ -376,9 +361,7 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
mace/ops/batch_to_space.h
浏览文件 @
33415ee9
...
...
@@ -36,8 +36,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
batch_tensor
=
this
->
Input
(
INPUT
);
Tensor
*
space_tensor
=
this
->
Output
(
OUTPUT
);
return
functor_
(
space_tensor
,
const_cast
<
Tensor
*>
(
batch_tensor
),
future
);
return
functor_
(
space_tensor
,
const_cast
<
Tensor
*>
(
batch_tensor
),
future
);
}
private:
...
...
mace/ops/bias_add.h
浏览文件 @
33415ee9
...
...
@@ -37,7 +37,7 @@ class BiasAddOp : public Operator<D, T> {
bias
->
dim_size
());
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
return
functor_
(
input
,
bias
,
output
,
future
);
}
...
...
mace/ops/bias_add_test.cc
浏览文件 @
33415ee9
...
...
@@ -32,9 +32,7 @@ void BiasAddSimple() {
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.5
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BiasAdd"
,
"BiasAddTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -43,10 +41,8 @@ void BiasAddSimple() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -79,9 +75,7 @@ void BiasAddSimple() {
TEST_F
(
BiasAddOpTest
,
BiasAddSimpleCPU
)
{
BiasAddSimple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
BiasAddOpTest
,
BiasAddSimpleOPENCL
)
{
BiasAddSimple
<
DeviceType
::
GPU
>
();
}
TEST_F
(
BiasAddOpTest
,
BiasAddSimpleOPENCL
)
{
BiasAddSimple
<
DeviceType
::
GPU
>
();
}
TEST_F
(
BiasAddOpTest
,
SimpleRandomOPENCL
)
{
// generate random input
...
...
@@ -94,13 +88,11 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Bias"
,
{
channels
},
true
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
...
...
@@ -113,9 +105,7 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -154,13 +144,11 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Bias"
,
{
channels
},
true
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
...
...
@@ -173,9 +161,7 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
mace/ops/buffer_to_image_test.cc
浏览文件 @
33415ee9
...
...
@@ -233,8 +233,8 @@ TEST(BufferToImageTest, ArgStringHalfToHalfSmall) {
const
unsigned
char
input_data
[]
=
{
0xCD
,
0x3C
,
0x33
,
0x40
,
};
TestStringHalfBidirectionTransform
<
DeviceType
::
GPU
,
half
>
(
kernels
::
ARGUMENT
,
{
2
},
input_data
);
TestStringHalfBidirectionTransform
<
DeviceType
::
GPU
,
half
>
(
kernels
::
ARGUMENT
,
{
2
},
input_data
);
}
}
// namespace test
...
...
mace/ops/channel_shuffle_test.cc
浏览文件 @
33415ee9
...
...
@@ -29,9 +29,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
"Input"
,
{
1
,
1
,
2
,
8
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
...
...
@@ -43,9 +41,7 @@ TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
mace/ops/concat_test.cc
浏览文件 @
33415ee9
...
...
@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <functional>
#include <string>
#include "gmock/gmock.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/concat.h"
#include "mace/ops/ops_test_util.h"
namespace
mace
{
namespace
ops
{
...
...
@@ -163,7 +163,7 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
int
concat_axis_size
=
0
;
// Construct graph
std
::
vector
<
std
::
vector
<
float
>>
inputs
(
num_inputs
,
std
::
vector
<
float
>
());
std
::
vector
<
const
float
*>
input_ptrs
(
num_inputs
);
std
::
vector
<
const
float
*>
input_ptrs
(
num_inputs
);
OpsTestNet
net
;
for
(
int
i
=
0
;
i
<
num_inputs
;
++
i
)
{
const
std
::
string
input_name
=
MakeString
(
"Input"
,
i
);
...
...
@@ -171,8 +171,8 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
concat_axis_size
+=
shapes
[
i
][
axis
];
GenerateRandomRealTypeData
(
shapes
[
i
],
&
inputs
[
i
]);
input_ptrs
[
i
]
=
inputs
[
i
].
data
();
net
.
AddInputFromArray
<
DeviceType
::
GPU
,
float
>
(
input_name
,
shapes
[
i
],
inputs
[
i
]);
net
.
AddInputFromArray
<
DeviceType
::
GPU
,
float
>
(
input_name
,
shapes
[
i
],
inputs
[
i
]);
BufferToImage
<
DeviceType
::
GPU
,
T
>
(
&
net
,
input_name
,
image_name
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
...
...
mace/ops/conv_2d_test.cc
浏览文件 @
33415ee9
...
...
@@ -25,7 +25,7 @@ namespace test {
class
Conv2dOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3VALID
()
{
OpsTestNet
net
;
// Add input data
...
...
@@ -39,9 +39,7 @@ void TestNHWCSimple3x3VALID() {
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -55,10 +53,8 @@ void TestNHWCSimple3x3VALID() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -91,7 +87,7 @@ void TestNHWCSimple3x3VALID() {
ExpectTensorNear
<
float
,
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3SAME
()
{
OpsTestNet
net
;
...
...
@@ -106,9 +102,7 @@ void TestNHWCSimple3x3SAME() {
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
1
},
{
0.1
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -122,10 +116,8 @@ void TestNHWCSimple3x3SAME() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -173,7 +165,7 @@ TEST_F(Conv2dOpTest, OPENCLSimple) {
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCSimple3x3WithoutBias
()
{
OpsTestNet
net
;
...
...
@@ -187,9 +179,7 @@ void TestNHWCSimple3x3WithoutBias() {
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -203,10 +193,8 @@ void TestNHWCSimple3x3WithoutBias() {
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -247,7 +235,7 @@ TEST_F(Conv2dOpTest, OPENCLWithoutBias) {
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestNHWCCombined3x3
()
{
// Construct graph
OpsTestNet
net
;
...
...
@@ -259,16 +247,13 @@ void TestNHWCCombined3x3() {
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Filter"
,
{
2
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
});
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
});
net
.
AddInputFromArray
<
D
,
T
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -282,10 +267,8 @@ void TestNHWCCombined3x3() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -330,14 +313,13 @@ TEST_F(Conv2dOpTest, OPENCLStride2) {
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestFusedNHWCSimple3x3VALID
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
});
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
...
...
@@ -345,9 +327,7 @@ void TestFusedNHWCSimple3x3VALID() {
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
-
0.1
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -362,10 +342,8 @@ void TestFusedNHWCSimple3x3VALID() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -398,25 +376,21 @@ void TestFusedNHWCSimple3x3VALID() {
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
0.0
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
));
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestFusedNHWCSimple3x3WithoutBias
()
{
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
3
,
3
,
2
},
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
});
{
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -431,10 +405,8 @@ void TestFusedNHWCSimple3x3WithoutBias() {
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -478,9 +450,8 @@ TEST_F(Conv2dOpTest, FusedOPENCLSimple) {
TestFusedNHWCSimple3x3WithoutBias
<
DeviceType
::
GPU
,
float
>
();
}
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestConv1x1
()
{
// Construct graph
OpsTestNet
net
;
...
...
@@ -501,9 +472,7 @@ void TestConv1x1() {
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -516,10 +485,8 @@ void TestConv1x1() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -565,7 +532,7 @@ TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
TEST_F
(
Conv2dOpTest
,
OPENCLConv1x1
)
{
TestConv1x1
<
DeviceType
::
GPU
>
();
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
stride
)
{
testing
::
internal
::
LogToStderr
();
...
...
@@ -586,9 +553,7 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
...
...
@@ -603,14 +568,11 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
@@ -639,8 +601,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
ImageToBuffer
<
D
,
T
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
};
for
(
int
kernel_size
:
{
1
,
3
,
5
,
7
})
{
...
...
@@ -666,7 +628,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
}
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestHalfComplexConvNxNS12
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
dilations
)
{
...
...
@@ -702,9 +664,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
float_filter_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
output_channels
},
float_bias_data
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
...
@@ -720,10 +680,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
@@ -753,8 +711,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-1
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-1
);
};
func
(
1
,
1
,
VALID
);
...
...
@@ -767,20 +725,16 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
}
// namespace
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
1
,
1
,
32
,
64
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
1
,
1
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
3
,
3
,
32
,
64
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
3
,
3
,
32
,
64
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv5x5S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
5
,
5
,
3
,
64
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
5
,
5
,
3
,
63
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
5
,
5
,
3
,
64
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
5
,
5
,
3
,
63
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x7S1
)
{
...
...
@@ -800,55 +754,45 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x1S1) {
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv7x7S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
7
,
7
,
3
,
64
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
7
,
7
,
3
,
63
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
7
,
7
,
3
,
64
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
7
,
7
,
3
,
63
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv15x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
15
,
1
,
256
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
15
,
1
,
64
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
15
,
1
,
256
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
15
,
1
,
64
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
256
,
256
},
{
15
,
1
,
32
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfAlignedConv1x15S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
1
,
15
,
256
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
32
,
32
},
{
1
,
15
,
256
,
2
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
256
,
256
},
{
1
,
15
,
32
,
2
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv1x1S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
107
,
113
},
{
1
,
1
,
5
,
7
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
107
,
113
},
{
1
,
1
,
5
,
7
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfUnalignedConv3x3S12
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
107
,
113
},
{
3
,
3
,
5
,
7
},
{
1
,
1
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
107
,
113
},
{
3
,
3
,
5
,
7
},
{
1
,
1
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv5x5Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
5
,
5
,
16
,
16
},
{
2
,
2
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
5
,
5
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation2
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
7
,
7
,
16
,
16
},
{
2
,
2
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
64
,
64
},
{
7
,
7
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalfConv7x7Dilation4
)
{
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
63
,
67
},
{
7
,
7
,
16
,
16
},
{
4
,
4
});
TestHalfComplexConvNxNS12
<
DeviceType
::
GPU
>
({
63
,
67
},
{
7
,
7
,
16
,
16
},
{
4
,
4
});
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestDilationConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
int
dilation_rate
)
{
testing
::
internal
::
LogToStderr
();
...
...
@@ -871,9 +815,7 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
...
...
@@ -890,11 +832,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
@@ -923,8 +862,8 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
ImageToBuffer
<
D
,
T
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
};
for
(
int
kernel_size
:
{
3
})
{
...
...
@@ -949,7 +888,7 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedDilation4) {
}
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestGeneralHalfAtrousConv
(
const
std
::
vector
<
index_t
>
&
image_shape
,
const
std
::
vector
<
index_t
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
dilations
)
{
...
...
@@ -975,9 +914,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
...
@@ -993,10 +930,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -1024,8 +959,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-
2
,
1e-
1
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-1
);
};
func
(
1
,
1
,
VALID
);
...
...
@@ -1034,8 +969,7 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
}
// namespace
TEST_F
(
Conv2dOpTest
,
OPENCLHalf7X7AtrousConvD2
)
{
TestGeneralHalfAtrousConv
<
DeviceType
::
GPU
>
({
32
,
32
},
{
7
,
7
,
16
,
3
},
{
2
,
2
});
TestGeneralHalfAtrousConv
<
DeviceType
::
GPU
>
({
32
,
32
},
{
7
,
7
,
16
,
3
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLHalf15X15AtrousConvD4
)
{
...
...
@@ -1044,7 +978,7 @@ TEST_F(Conv2dOpTest, OPENCLHalf15X15AtrousConvD4) {
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestArbitraryPadConvNxN
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
int
>
&
paddings
)
{
testing
::
internal
::
LogToStderr
();
...
...
@@ -1066,9 +1000,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Construct graph
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
...
...
@@ -1084,10 +1016,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
@@ -1115,8 +1045,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
ImageToBuffer
<
D
,
T
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
};
for
(
int
kernel_size
:
{
3
,
5
,
7
})
{
...
...
@@ -1132,8 +1062,7 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad1) {
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedPad2
)
{
TestArbitraryPadConvNxN
<
DeviceType
::
GPU
,
float
>
({
128
,
128
,
16
,
16
},
{
2
,
2
});
TestArbitraryPadConvNxN
<
DeviceType
::
GPU
,
float
>
({
128
,
128
,
16
,
16
},
{
2
,
2
});
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedPad4
)
{
...
...
mace/ops/conv_pool_2d_base.h
浏览文件 @
33415ee9
...
...
@@ -32,8 +32,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
padding_type_
(
static_cast
<
Padding
>
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"padding"
,
static_cast
<
int
>
(
SAME
)))),
paddings_
(
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"padding_values"
)),
dilations_
(
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"dilations"
,
{
1
,
1
}))
{}
dilations_
(
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"dilations"
,
{
1
,
1
}))
{}
protected:
std
::
vector
<
int
>
strides_
;
...
...
mace/ops/core_test.cc
浏览文件 @
33415ee9
...
...
@@ -31,8 +31,7 @@ TEST(CoreTest, INIT_MODE) {
.
AddIntArg
(
"mode"
,
static_cast
<
int
>
(
NetMode
::
INIT
))
.
Finalize
(
&
op_defs
[
op_defs
.
size
()
-
1
]);
Tensor
*
input
=
ws
.
CreateTensor
(
"Input"
,
GetDeviceAllocator
(
DeviceType
::
GPU
),
Tensor
*
input
=
ws
.
CreateTensor
(
"Input"
,
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
float
>::
v
());
input
->
Resize
({
1
,
3
,
3
,
3
});
{
...
...
mace/ops/deconv_2d_test.cc
浏览文件 @
33415ee9
...
...
@@ -25,7 +25,7 @@ namespace test {
class
Deconv2dOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
RunTestSimple
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
float
>
&
input_data
,
const
int
stride
,
...
...
@@ -40,10 +40,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
// Add input data
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
input_shape
,
input_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
filter_shape
,
filter_data
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Filter"
,
HWOI
,
"FilterOIHW"
,
OIHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Filter"
,
HWOI
,
"FilterOIHW"
,
OIHW
);
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
...
...
@@ -66,9 +63,7 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
else
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Deconv2D"
,
"Deconv2dTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -81,317 +76,165 @@ void RunTestSimple(const std::vector<index_t> &input_shape,
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
auto
expected
=
CreateTensor
<
float
>
(
expected_shape
,
expected_data
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.0001
);
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple3x3SAME_S1
()
{
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
1
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
3
,
3
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
3
,
3
,
3
},
{
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
9
,
9
,
9
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
1
,
Padding
::
VALID
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
3
,
3
,
3
},
{
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
9
,
9
,
9
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
3
,
3
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
3
,
3
,
3
},
{
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
1
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
3
,
3
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
3
,
3
,
3
},
{
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
9
,
9
,
9
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
1
,
Padding
::
VALID
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
3
,
3
,
3
},
{
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
9
,
9
,
9
,
6
,
6
,
6
,
4
,
4
,
4
,
6
,
6
,
6
,
4
,
4
,
4
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
3
,
3
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
3
,
3
,
3
},
{
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
SAME
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
SAME
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
3
,
3
,
3
},
{
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
});
{
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
});
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple3x3SAME_S2
()
{
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
2
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
6
,
6
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
2
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
6
,
6
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
6
,
6
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
2
,
Padding
::
SAME
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
5
,
5
,
3
},
{
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
2
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
6
,
6
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
});
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
2
,
Padding
::
SAME
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
5
,
5
,
3
},
{
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
});
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
2
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
6
,
6
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
6
,
6
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
,
9
,
12
,
15
,
8
,
10
,
12
,
17
,
22
,
27
,
12
,
15
,
18
,
10
,
11
,
12
,
13
,
14
,
15
,
36
,
39
,
42
,
26
,
28
,
30
,
62
,
67
,
72
,
39
,
42
,
45
,
23
,
28
,
33
,
38
,
43
,
48
,
96
,
108
,
120
,
64
,
71
,
78
,
148
,
164
,
180
,
90
,
99
,
108
,
40
,
44
,
48
,
52
,
56
,
60
,
114
,
123
,
132
,
65
,
70
,
75
,
140
,
151
,
162
,
78
,
84
,
90
,
83
,
94
,
105
,
116
,
127
,
138
,
252
,
276
,
300
,
142
,
155
,
168
,
304
,
332
,
360
,
168
,
183
,
198
,
70
,
77
,
84
,
91
,
98
,
105
,
192
,
207
,
222
,
104
,
112
,
120
,
218
,
235
,
252
,
117
,
126
,
135
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
2
,
Padding
::
SAME
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
2
,
3
,
4
,
5
,
6
,
9
,
12
,
15
,
8
,
10
,
12
,
17
,
22
,
27
,
12
,
15
,
18
,
10
,
11
,
12
,
13
,
14
,
15
,
36
,
39
,
42
,
26
,
28
,
30
,
62
,
67
,
72
,
39
,
42
,
45
,
23
,
28
,
33
,
38
,
43
,
48
,
96
,
108
,
120
,
64
,
71
,
78
,
148
,
164
,
180
,
90
,
99
,
108
,
40
,
44
,
48
,
52
,
56
,
60
,
114
,
123
,
132
,
65
,
70
,
75
,
140
,
151
,
162
,
78
,
84
,
90
,
83
,
94
,
105
,
116
,
127
,
138
,
252
,
276
,
300
,
142
,
155
,
168
,
304
,
332
,
360
,
168
,
183
,
198
,
70
,
77
,
84
,
91
,
98
,
105
,
192
,
207
,
222
,
104
,
112
,
120
,
218
,
235
,
252
,
117
,
126
,
135
});
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
2
,
Padding
::
SAME
,
{
2
,
2
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
5
,
5
,
3
},
{
13
,
14
,
15
,
36
,
39
,
42
,
26
,
28
,
30
,
62
,
67
,
72
,
39
,
42
,
45
,
38
,
43
,
48
,
96
,
108
,
120
,
64
,
71
,
78
,
148
,
164
,
180
,
90
,
99
,
108
,
52
,
56
,
60
,
114
,
123
,
132
,
65
,
70
,
75
,
140
,
151
,
162
,
78
,
84
,
90
,
116
,
127
,
138
,
252
,
276
,
300
,
142
,
155
,
168
,
304
,
332
,
360
,
168
,
183
,
198
,
91
,
98
,
105
,
192
,
207
,
222
,
104
,
112
,
120
,
218
,
235
,
252
,
117
,
126
,
135
});
{
13
,
14
,
15
,
36
,
39
,
42
,
26
,
28
,
30
,
62
,
67
,
72
,
39
,
42
,
45
,
38
,
43
,
48
,
96
,
108
,
120
,
64
,
71
,
78
,
148
,
164
,
180
,
90
,
99
,
108
,
52
,
56
,
60
,
114
,
123
,
132
,
65
,
70
,
75
,
140
,
151
,
162
,
78
,
84
,
90
,
116
,
127
,
138
,
252
,
276
,
300
,
142
,
155
,
168
,
304
,
332
,
360
,
168
,
183
,
198
,
91
,
98
,
105
,
192
,
207
,
222
,
104
,
112
,
120
,
218
,
235
,
252
,
117
,
126
,
135
});
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple3x3SAME_S2_1
()
{
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
12
,
18
,
12
,
18
,
27
,
18
,
12
,
18
,
12
}
,
2
,
Padding
::
SAME
,
{
0
,
0
},
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
12
,
18
,
12
,
18
,
27
,
18
,
12
,
18
,
12
},
2
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
5
,
5
,
3
},
{
3
,
3
,
3
,
1
}
,
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
5
,
5
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
5
,
5
,
3
},
{
12
,
12
,
12
,
30
,
30
,
30
,
18
,
18
,
18
,
30
,
30
,
30
,
12
,
12
,
12
,
30
,
30
,
30
,
75
,
75
,
75
,
45
,
45
,
45
,
75
,
75
,
75
,
30
,
30
,
30
,
18
,
18
,
18
,
45
,
45
,
45
,
27
,
27
,
27
,
45
,
45
,
45
,
18
,
18
,
18
,
30
,
30
,
30
,
75
,
75
,
75
,
45
,
45
,
45
,
75
,
75
,
75
,
30
,
30
,
30
,
12
,
12
,
12
,
30
,
30
,
30
,
18
,
18
,
18
,
30
,
30
,
30
,
12
,
12
,
12
});
{
12
,
12
,
12
,
30
,
30
,
30
,
18
,
18
,
18
,
30
,
30
,
30
,
12
,
12
,
12
,
30
,
30
,
30
,
75
,
75
,
75
,
45
,
45
,
45
,
75
,
75
,
75
,
30
,
30
,
30
,
18
,
18
,
18
,
45
,
45
,
45
,
27
,
27
,
27
,
45
,
45
,
45
,
18
,
18
,
18
,
30
,
30
,
30
,
75
,
75
,
75
,
45
,
45
,
45
,
75
,
75
,
75
,
30
,
30
,
30
,
12
,
12
,
12
,
30
,
30
,
30
,
18
,
18
,
18
,
30
,
30
,
30
,
12
,
12
,
12
});
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple3x3VALID_S2
()
{
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
2
,
Padding
::
VALID
,
{
0
,
0
},
{
1
,
7
,
7
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
2
,
Padding
::
VALID
,
{
0
,
0
},
{
1
,
7
,
7
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
},
{
1
,
7
,
7
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
});
{
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
4
,
4
,
4
,
2
,
2
,
2
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
2
,
2
,
2
,
1
,
1
,
1
,
1
,
1
,
1
});
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple3x3VALID_S1
()
{
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
VALID
,
{
0
,
0
},
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
VALID
,
{
0
,
0
},
{
1
,
5
,
5
,
3
},
{
3
,
3
,
3
,
1
}
,
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
5
,
5
,
3
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
2
,
3
,
6
,
9
,
12
,
18
,
24
,
30
,
26
,
31
,
36
,
21
,
24
,
27
,
14
,
19
,
24
,
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
90
,
99
,
108
,
66
,
78
,
90
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
234
,
252
,
270
,
146
,
157
,
168
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
,
294
,
309
,
324
,
133
,
140
,
147
,
306
,
321
,
336
,
522
,
546
,
570
,
398
,
415
,
432
,
225
,
234
,
243
});
RunTestSimple
<
D
>
(
{
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
VALID
,
{
4
,
4
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
5
,
5
,
3
},
{
1
,
2
,
3
,
6
,
9
,
12
,
18
,
24
,
30
,
26
,
31
,
36
,
21
,
24
,
27
,
14
,
19
,
24
,
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
90
,
99
,
108
,
66
,
78
,
90
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
234
,
252
,
270
,
146
,
157
,
168
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
,
294
,
309
,
324
,
133
,
140
,
147
,
306
,
321
,
336
,
522
,
546
,
570
,
398
,
415
,
432
,
225
,
234
,
243
});
RunTestSimple
<
D
>
({
1
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
},
1
,
Padding
::
VALID
,
{
4
,
4
},
{
0
},
{
3
,
3
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
},
{
1
,
5
,
5
,
3
},
{
1
,
2
,
3
,
6
,
9
,
12
,
18
,
24
,
30
,
26
,
31
,
36
,
21
,
24
,
27
,
14
,
19
,
24
,
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
90
,
99
,
108
,
66
,
78
,
90
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
234
,
252
,
270
,
146
,
157
,
168
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
,
294
,
309
,
324
,
133
,
140
,
147
,
306
,
321
,
336
,
522
,
546
,
570
,
398
,
415
,
432
,
225
,
234
,
243
});
{
1
,
2
,
3
,
6
,
9
,
12
,
18
,
24
,
30
,
26
,
31
,
36
,
21
,
24
,
27
,
14
,
19
,
24
,
54
,
66
,
78
,
126
,
147
,
168
,
130
,
146
,
162
,
90
,
99
,
108
,
66
,
78
,
90
,
198
,
225
,
252
,
405
,
450
,
495
,
366
,
399
,
432
,
234
,
252
,
270
,
146
,
157
,
168
,
354
,
378
,
402
,
630
,
669
,
708
,
502
,
530
,
558
,
294
,
309
,
324
,
133
,
140
,
147
,
306
,
321
,
336
,
522
,
546
,
570
,
398
,
415
,
432
,
225
,
234
,
243
});
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple2x2SAME
()
{
RunTestSimple
<
D
>
({
1
,
2
,
2
,
1
},
{
1
,
1
,
1
,
1
},
1
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
2
,
2
,
1
},
{
3
,
3
,
1
,
1
},
RunTestSimple
<
D
>
({
1
,
2
,
2
,
1
},
{
1
,
1
,
1
,
1
},
1
,
Padding
::
SAME
,
{
0
,
0
},
{
1
,
2
,
2
,
1
},
{
3
,
3
,
1
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
},
{
1
,
2
,
2
,
1
},
{
4.
f
,
4.
f
,
4.
f
,
4.
f
});
{
1
,
2
,
2
,
1
},
{
4.
f
,
4.
f
,
4.
f
,
4.
f
});
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
TestNHWCSimple2x2VALID
()
{
RunTestSimple
<
D
>
({
1
,
2
,
2
,
1
},
{
1
,
1
,
1
,
1
},
2
,
Padding
::
VALID
,
{
0
,
0
},
{
1
,
5
,
5
,
1
},
{
3
,
3
,
1
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
},
RunTestSimple
<
D
>
(
{
1
,
2
,
2
,
1
},
{
1
,
1
,
1
,
1
},
2
,
Padding
::
VALID
,
{
0
,
0
},
{
1
,
5
,
5
,
1
},
{
3
,
3
,
1
,
1
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
},
{
1
,
5
,
5
,
1
},
{
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
,
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
,
2.
f
,
2.
f
,
4.
f
,
2.
f
,
2.
f
,
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
,
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
});
{
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
,
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
,
2.
f
,
2.
f
,
4.
f
,
2.
f
,
2.
f
,
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
,
1.
f
,
1.
f
,
2.
f
,
1.
f
,
1.
f
});
}
}
// namespace
...
...
@@ -400,11 +243,11 @@ TEST_F(Deconv2dOpTest, CPUSimple3X3PaddingSame_S1) {
}
TEST_F
(
Deconv2dOpTest
,
CPUSimple3X3PaddingSame_S2
)
{
TestNHWCSimple3x3SAME_S2
<
DeviceType
::
CPU
>
();
TestNHWCSimple3x3SAME_S2
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Deconv2dOpTest
,
CPUSimple3X3PaddingSame_S2_1
)
{
TestNHWCSimple3x3SAME_S2_1
<
DeviceType
::
CPU
>
();
TestNHWCSimple3x3SAME_S2_1
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Deconv2dOpTest
,
CPUSimple2X2PaddingSame
)
{
...
...
@@ -432,11 +275,11 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingSame_S1) {
}
TEST_F
(
Deconv2dOpTest
,
OPENCLSimple3X3PaddingSame_S2
)
{
TestNHWCSimple3x3SAME_S2
<
DeviceType
::
GPU
>
();
TestNHWCSimple3x3SAME_S2
<
DeviceType
::
GPU
>
();
}
TEST_F
(
Deconv2dOpTest
,
OPENCLSimple3X3PaddingSame_S2_1
)
{
TestNHWCSimple3x3SAME_S2_1
<
DeviceType
::
GPU
>
();
TestNHWCSimple3x3SAME_S2_1
<
DeviceType
::
GPU
>
();
}
TEST_F
(
Deconv2dOpTest
,
OPENCLSimple2X2PaddingValid
)
{
...
...
@@ -452,7 +295,7 @@ TEST_F(Deconv2dOpTest, OPENCLSimple3X3PaddingValid_S2) {
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TestComplexDeconvNxNS12
(
const
int
batch
,
const
std
::
vector
<
int
>
&
shape
,
const
int
stride
)
{
...
...
@@ -473,14 +316,12 @@ void TestComplexDeconvNxNS12(const int batch,
net
.
AddRandomInput
<
D
,
T
>
(
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
T
>
(
"Bias"
,
{
output_channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
int
out_h
=
0
;
int
out_w
=
0
;
std
::
vector
<
int
>
paddings
;
std
::
vector
<
int
>
paddings
;
std
::
vector
<
int
>
output_shape
;
if
(
padding
<
0
)
{
...
...
@@ -496,8 +337,8 @@ void TestComplexDeconvNxNS12(const int batch,
output_shape
.
push_back
(
out_w
);
output_shape
.
push_back
(
output_channels
);
}
else
{
// out_h = (height - 1) * stride + 1 + padding - kernel_h + 1;
// out_w = (width -1) * stride + 1 + padding - kernel_w + 1;
// out_h = (height - 1) * stride + 1 + padding - kernel_h + 1;
// out_w = (width -1) * stride + 1 + padding - kernel_w + 1;
paddings
.
push_back
(
padding
);
paddings
.
push_back
(
padding
);
}
...
...
@@ -514,14 +355,11 @@ void TestComplexDeconvNxNS12(const int batch,
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
@@ -551,8 +389,8 @@ void TestComplexDeconvNxNS12(const int batch,
ImageToBuffer
<
D
,
T
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-4
,
1e-4
);
};
for
(
int
kernel_size
:
{
1
,
3
,
5
,
7
})
{
...
...
@@ -575,8 +413,8 @@ TEST_F(Deconv2dOpTest, OPENCLAlignedDeconvNxNS34) {
}
TEST_F
(
Deconv2dOpTest
,
OPENCLUnalignedDeconvNxNS12
)
{
TestComplexDeconvNxNS12
<
DeviceType
::
GPU
,
float
>
(
1
,
{
17
,
113
,
5
,
7
},
1
);
TestComplexDeconvNxNS12
<
DeviceType
::
GPU
,
float
>
(
1
,
{
17
,
113
,
5
,
7
},
2
);
TestComplexDeconvNxNS12
<
DeviceType
::
GPU
,
float
>
(
1
,
{
17
,
113
,
5
,
7
},
1
);
TestComplexDeconvNxNS12
<
DeviceType
::
GPU
,
float
>
(
1
,
{
17
,
113
,
5
,
7
},
2
);
}
TEST_F
(
Deconv2dOpTest
,
OPENCLUnalignedDeconvNxNS34
)
{
...
...
mace/ops/depth_to_space_test.cc
浏览文件 @
33415ee9
...
...
@@ -36,9 +36,7 @@ void RunDepthToSpace(const bool d2s,
const
char
*
ops_test_name
=
(
d2s
)
?
"DepthToSpaceTest"
:
"SpaceToDepthTest"
;
// Construct graph
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
ops_name
,
ops_test_name
)
.
Input
(
"InputNCHW"
)
...
...
@@ -47,10 +45,8 @@ void RunDepthToSpace(const bool d2s,
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
...
...
@@ -64,7 +60,6 @@ void RunDepthToSpace(const bool d2s,
net
.
RunOp
(
D
);
}
if
(
D
==
DeviceType
::
GPU
)
{
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -74,108 +69,89 @@ void RunDepthToSpace(const bool d2s,
}
}
// namespace
class
SpaceToDepthOpTest
:
public
OpsTestBase
{};
TEST_F
(
SpaceToDepthOpTest
,
Input2x4x4_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
false
,
{
1
,
2
,
4
,
4
},
RunDepthToSpace
<
DeviceType
::
CPU
>
(
false
,
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
1
,
2
,
16
},
2
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input2x4x4_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
false
,
{
1
,
2
,
4
,
4
},
RunDepthToSpace
<
DeviceType
::
GPU
>
(
false
,
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
1
,
2
,
16
},
2
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input2x2x4_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
false
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
RunDepthToSpace
<
DeviceType
::
CPU
>
(
false
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input4x4x1_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
false
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
RunDepthToSpace
<
DeviceType
::
GPU
>
(
false
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
class
DepthToSpaceOpTest
:
public
OpsTestBase
{};
TEST_F
(
DepthToSpaceOpTest
,
Input1x2x16_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
true
,
{
1
,
1
,
2
,
16
},
RunDepthToSpace
<
DeviceType
::
CPU
>
(
true
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
2
,
4
,
4
},
2
,
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
DepthToSpaceOpTest
,
Input1x2x16_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
1
,
2
,
16
},
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
2
,
4
,
4
},
2
,
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
DepthToSpaceOpTest
,
Input1x1x16_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
true
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
RunDepthToSpace
<
DeviceType
::
CPU
>
(
true
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
TEST_F
(
DepthToSpaceOpTest
,
Input1x1x16_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
TEST_F
(
DepthToSpaceOpTest
,
InputLarger_B2_OPENCL
)
{
const
std
::
vector
<
float
>
in
=
std
::
vector
<
float
>
(
192
*
192
*
128
,
1.0
);
const
std
::
vector
<
float
>
in
=
std
::
vector
<
float
>
(
192
*
192
*
128
,
1.0
);
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
192
,
192
,
128
},
in
,
2
,
{
1
,
384
,
384
,
32
},
in
);
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
192
,
192
,
128
},
in
,
2
,
{
1
,
384
,
384
,
32
},
in
);
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
RandomTest
(
const
bool
d2s
,
const
int
block_size
,
void
RandomTest
(
const
bool
d2s
,
const
int
block_size
,
const
std
::
vector
<
index_t
>
&
shape
)
{
testing
::
internal
::
LogToStderr
();
srand
(
time
(
NULL
));
...
...
@@ -188,9 +164,7 @@ void RandomTest(const bool d2s, const int block_size,
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
shape
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
ops_name
,
ops_test_name
)
.
Input
(
"InputNCHW"
)
...
...
@@ -201,12 +175,9 @@ void RandomTest(const bool d2s, const int block_size,
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImg"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -238,15 +209,15 @@ TEST_F(DepthToSpaceOpTest, OPENCLRandomFloat) {
}
TEST_F
(
DepthToSpaceOpTest
,
OPENCLRandomHalf
)
{
RandomTest
<
DeviceType
::
GPU
,
half
>
(
true
,
2
,
{
1
,
192
,
192
,
128
});
RandomTest
<
DeviceType
::
GPU
,
half
>
(
true
,
2
,
{
1
,
192
,
192
,
128
});
}
TEST_F
(
SpaceToDepthOpTest
,
OPENCLRandomFloat
)
{
RandomTest
<
DeviceType
::
GPU
,
float
>
(
false
,
2
,
{
1
,
384
,
384
,
32
});
RandomTest
<
DeviceType
::
GPU
,
float
>
(
false
,
2
,
{
1
,
384
,
384
,
32
});
}
TEST_F
(
SpaceToDepthOpTest
,
OPENCLRandomHalf
)
{
RandomTest
<
DeviceType
::
GPU
,
half
>
(
false
,
2
,
{
1
,
384
,
384
,
32
});
RandomTest
<
DeviceType
::
GPU
,
half
>
(
false
,
2
,
{
1
,
384
,
384
,
32
});
}
}
// namespace test
...
...
mace/ops/depthwise_conv2d_test.cc
浏览文件 @
33415ee9
...
...
@@ -22,7 +22,7 @@ namespace test {
class
DepthwiseConv2dOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
SimpleValidTest
()
{
testing
::
internal
::
LogToStderr
();
// Construct graph
...
...
@@ -36,9 +36,7 @@ void SimpleValidTest() {
"Filter"
,
{
1
,
2
,
2
,
2
},
{
1.0
f
,
2.0
f
,
3.0
f
,
4.0
f
,
2.0
f
,
4.0
f
,
6.0
f
,
8.0
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
.1
f
,
.2
f
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"DepthwiseConv2d"
,
"DepthwiseConv2DTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -51,10 +49,8 @@ void SimpleValidTest() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -85,8 +81,8 @@ void SimpleValidTest() {
// Check
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
2
,
2
,
2
},
{
37.1
f
,
148.2
f
,
47.1
f
,
188.2
f
,
67.1
f
,
268.2
f
,
77.1
f
,
308.2
f
});
{
1
,
2
,
2
,
2
}
,
{
37.1
f
,
148.2
f
,
47.1
f
,
188.2
f
,
67.1
f
,
268.2
f
,
77.1
f
,
308.2
f
});
if
(
DataTypeToEnum
<
T
>::
value
==
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-3
,
1e-3
);
...
...
@@ -109,9 +105,13 @@ TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
ComplexValidTest
(
index_t
batch
,
index_t
channel
,
index_t
height
,
index_t
width
,
index_t
kernel
,
index_t
multiplier
,
template
<
DeviceType
D
,
typename
T
>
void
ComplexValidTest
(
index_t
batch
,
index_t
channel
,
index_t
height
,
index_t
width
,
index_t
kernel
,
index_t
multiplier
,
int
stride
)
{
testing
::
internal
::
LogToStderr
();
// Construct graph
...
...
@@ -125,18 +125,14 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
std
::
vector
<
float
>
filter_data
(
kernel
*
kernel
*
channel
*
multiplier
);
GenerateRandomRealTypeData
({
multiplier
,
channel
,
kernel
,
kernel
},
&
filter_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
multiplier
,
channel
,
kernel
,
kernel
},
filter_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
multiplier
,
channel
,
kernel
,
kernel
},
filter_data
);
std
::
vector
<
float
>
bias_data
(
channel
*
multiplier
);
GenerateRandomRealTypeData
({
channel
*
multiplier
},
&
bias_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
channel
*
multiplier
},
bias_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
channel
*
multiplier
},
bias_data
);
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"DepthwiseConv2d"
,
"DepthwiseConv2DTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -150,10 +146,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -217,8 +211,8 @@ void ComplexValidTest(index_t batch, index_t channel, index_t height,
}
}
auto
expected
=
CreateTensor
<
T
>
(
{
1
,
out_height
,
out_width
,
out_channels
},
expect
);
auto
expected
=
CreateTensor
<
T
>
(
{
1
,
out_height
,
out_width
,
out_channels
},
expect
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT
)
{
ExpectTensorNear
<
T
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
...
...
@@ -249,7 +243,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
}
namespace
{
template
<
typename
T
>
template
<
typename
T
>
void
TestNxNS12
(
const
index_t
height
,
const
index_t
width
)
{
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
...
...
@@ -263,18 +257,14 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
input_channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Filter"
,
{
multiplier
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Bias"
,
{
multiplier
*
input_channels
});
{
multiplier
*
input_channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"DepthwiseConv2d"
,
"DepthwiseConv2DTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -290,10 +280,8 @@ void TestNxNS12(const index_t height, const index_t width) {
// Run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
Tensor
expected
;
...
...
@@ -319,18 +307,16 @@ void TestNxNS12(const index_t height, const index_t width) {
net
.
RunOp
(
DeviceType
::
GPU
);
// Transfer output
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"DeviceOutput"
,
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"DeviceOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
// Check
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
1e-
5
,
1e-
4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
1e-5
,
1e-4
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
1e-2
,
1e-2
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
1e-2
,
1e-2
);
}
};
...
...
@@ -343,9 +329,7 @@ void TestNxNS12(const index_t height, const index_t width) {
}
}
// namespace
TEST_F
(
DepthwiseConv2dOpTest
,
OpenCLSimpleNxNS12
)
{
TestNxNS12
<
float
>
(
4
,
4
);
}
TEST_F
(
DepthwiseConv2dOpTest
,
OpenCLSimpleNxNS12
)
{
TestNxNS12
<
float
>
(
4
,
4
);
}
TEST_F
(
DepthwiseConv2dOpTest
,
OpenCLSimpleNxNS12Half
)
{
TestNxNS12
<
half
>
(
4
,
4
);
...
...
mace/ops/eltwise.h
浏览文件 @
33415ee9
...
...
@@ -26,15 +26,15 @@ class EltwiseOp : public Operator<D, T> {
public:
EltwiseOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
static_cast
<
kernels
::
EltwiseType
>
(
OperatorBase
::
GetOptionalArg
<
int
>
(
functor_
(
static_cast
<
kernels
::
EltwiseType
>
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"type"
,
static_cast
<
int
>
(
kernels
::
EltwiseType
::
NONE
))),
OperatorBase
::
GetRepeatedArgs
<
float
>
(
"coeff"
),
OperatorBase
::
GetOptionalArg
<
float
>
(
"x"
,
1.0
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input0
=
this
->
Input
(
0
);
const
Tensor
*
input1
=
this
->
InputSize
()
==
2
?
this
->
Input
(
1
)
:
nullptr
;
const
Tensor
*
input0
=
this
->
Input
(
0
);
const
Tensor
*
input1
=
this
->
InputSize
()
==
2
?
this
->
Input
(
1
)
:
nullptr
;
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
return
functor_
(
input0
,
input1
,
output
,
future
);
}
...
...
mace/ops/eltwise_test.cc
浏览文件 @
33415ee9
...
...
@@ -36,10 +36,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
shape
,
input
);
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
D
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
NCHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
NCHW
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"TInput"
)
.
AddIntArg
(
"type"
,
static_cast
<
int
>
(
type
))
...
...
@@ -48,10 +45,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
D
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
D
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
}
else
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImg"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -90,10 +84,8 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
net
.
AddInputFromArray
<
D
,
float
>
(
"Input1"
,
shape1
,
input1
);
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
D
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
net
.
TransformDataFormat
<
D
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"TInput0"
)
.
Input
(
"TInput1"
)
...
...
@@ -104,8 +96,7 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
D
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
D
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
}
else
{
BufferToImage
<
D
,
T
>
(
&
net
,
"Input0"
,
"InputImg0"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -138,278 +129,181 @@ void SimpleTensorEltwise(const kernels::EltwiseType type,
TEST_F
(
EltwiseOpTest
,
CPUSimpleTensorScalar
)
{
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
1
,
1
},
{
1
},
1
,
{
2
});
{
1
,
1
,
1
,
1
},
{
1
},
1
,
{
2
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
2
,
3
,
4
,
5
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
2
,
{
2
,
4
,
6
,
8
,
10
,
12
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
2
,
{
2
,
4
,
6
,
8
,
10
,
12
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
1
,
2
,
3
},
{
2
,
4
,
6
,
8
,
10
,
12
},
2
,
{
1
,
2
,
3
,
4
,
5
,
6
});
{
1
,
1
,
2
,
3
},
{
2
,
4
,
6
,
8
,
10
,
12
},
2
,
{
1
,
2
,
3
,
4
,
5
,
6
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
1
,
1
,
1
,
1
,
1
,
1
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
1
,
1
,
1
,
1
,
1
,
1
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
3
,
3
,
3
,
4
,
5
,
6
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
3
,
3
,
3
,
4
,
5
,
6
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
NEG
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
ABS
,
{
1
,
1
,
2
,
3
},
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
},
3
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
ABS
,
{
1
,
1
,
2
,
3
},
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
},
3
,
{
1
,
2
,
3
,
4
,
5
,
6
});
SimpleTensorScalar
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
4
,
9
,
16
,
25
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
4
,
9
,
16
,
25
});
}
TEST_F
(
EltwiseOpTest
,
GPUSimpleTensorScalar
)
{
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
1
,
1
},
{
1
},
1
,
{
2
});
{
1
,
1
,
1
,
1
},
{
1
},
1
,
{
2
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
2
,
3
,
4
,
5
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
2
,
{
2
,
4
,
6
,
8
,
10
,
12
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
2
,
{
2
,
4
,
6
,
8
,
10
,
12
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
1
,
2
,
3
},
{
2
,
4
,
6
,
8
,
10
,
12
},
2
,
{
1
,
2
,
3
,
4
,
5
,
6
});
{
1
,
1
,
2
,
3
},
{
2
,
4
,
6
,
8
,
10
,
12
},
2
,
{
1
,
2
,
3
,
4
,
5
,
6
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
1
,
1
,
1
,
1
,
1
,
1
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
1
,
1
,
1
,
1
,
1
,
1
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
3
,
3
,
3
,
4
,
5
,
6
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
3
,
3
,
3
,
4
,
5
,
6
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
NEG
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
ABS
,
{
1
,
1
,
2
,
3
},
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
},
3
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
3
,
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
ABS
,
{
1
,
1
,
2
,
3
},
{
-
1
,
-
2
,
-
3
,
-
4
,
-
5
,
-
6
},
3
,
{
1
,
2
,
3
,
4
,
5
,
6
});
SimpleTensorScalar
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
4
,
9
,
16
,
25
});
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
1
,
{
0
,
1
,
4
,
9
,
16
,
25
});
}
TEST_F
(
EltwiseOpTest
,
CPUSimpleTensorVector
)
{
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
2
,
4
,
6
,
5
,
7
,
9
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
,
5
,
5
,
5
,
5
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
-
5
,
-
5
,
-
5
,
-
5
,
-
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
4
,
10
,
18
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
1
,
6
,
7
,
8
,
9
,
2
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
2
,
4
},
{
1
,
2
,
1
,
5
},
{
1
,
1
,
1
,
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
2
,
1
,
1
,
1
,
2
,
4
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
2
,
4
,
6
,
5
,
7
,
9
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
,
5
,
5
,
5
,
5
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
-
5
,
-
5
,
-
5
,
-
5
,
-
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
4
,
10
,
18
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
1
,
6
,
7
,
8
,
9
,
2
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
2
,
4
},
{
1
,
2
,
1
,
5
},
{
1
,
1
,
1
,
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
2
,
1
,
1
,
1
,
2
,
4
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
}
TEST_F
(
EltwiseOpTest
,
GPUSimpleTensorVector
)
{
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
2
,
4
,
6
,
5
,
7
,
9
});
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
2
,
4
,
6
,
5
,
7
,
9
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
,
5
,
5
,
5
,
5
,
5
});
kernels
::
EltwiseType
::
SUB
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
,
5
,
5
,
5
,
5
,
5
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
-
5
,
-
5
,
-
5
,
-
5
,
-
5
});
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
-
5
,
-
5
,
-
5
,
-
5
,
-
5
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
4
,
10
,
18
});
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
3
},
{
1
,
2
,
3
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
4
,
10
,
18
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
1
,
6
,
7
,
8
,
9
,
2
});
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
1
,
6
,
7
,
8
,
9
,
2
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
2
,
4
},
{
1
,
2
,
1
,
5
},
{
1
,
1
,
1
,
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
2
,
1
,
1
,
1
,
2
,
4
});
kernels
::
EltwiseType
::
DIV
,
{
1
,
1
,
1
,
5
},
{
1
,
1
,
1
,
2
,
4
},
{
1
,
2
,
1
,
5
},
{
1
,
1
,
1
,
2
,
2
,
1
,
1
,
1
,
1
,
1
},
{
1
,
1
,
1
,
1
,
2
,
1
,
1
,
1
,
2
,
4
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
}
TEST_F
(
EltwiseOpTest
,
CPUSimpleTensorTensor
)
{
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
4
,
6
,
8
,
10
,
12
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
0.2
,
0.4
,
0.6
,
0.8
,
1
,
1.2
},
{
0.1
,
0.1
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
16
,
25
,
36
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
1
,
1
,
1
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
4
,
6
,
8
,
10
,
12
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
0.2
,
0.4
,
0.6
,
0.8
,
1
,
1.2
},
{
0.1
,
0.1
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
16
,
25
,
36
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
1
,
1
,
1
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
SimpleTensorEltwise
<
DeviceType
::
CPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
}
TEST_F
(
EltwiseOpTest
,
GPUSimpleTensorTensor
)
{
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
4
,
6
,
8
,
10
,
12
});
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
2
,
4
,
6
,
8
,
10
,
12
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
0.2
,
0.4
,
0.6
,
0.8
,
1
,
1.2
},
{
0.1
,
0.1
});
kernels
::
EltwiseType
::
SUM
,
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
0.2
,
0.4
,
0.6
,
0.8
,
1
,
1.2
},
{
0.1
,
0.1
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
});
kernels
::
EltwiseType
::
SUB
,
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
},
{
0
,
0
,
0
,
0
,
0
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
16
,
25
,
36
});
kernels
::
EltwiseType
::
PROD
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
4
,
9
,
16
,
25
,
36
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
1
,
1
,
1
});
kernels
::
EltwiseType
::
DIV
,
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
1
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
1
,
1
,
1
,
1
,
1
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
kernels
::
EltwiseType
::
MIN
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
kernels
::
EltwiseType
::
MAX
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
});
SimpleTensorEltwise
<
DeviceType
::
GPU
,
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
kernels
::
EltwiseType
::
SQR_DIFF
,
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
1
,
2
,
3
,
4
,
5
},
{
1
,
2
,
1
,
5
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
},
{
0
,
0
,
0
,
0
,
0
,
25
,
25
,
25
,
25
,
25
});
}
namespace
{
...
...
@@ -422,9 +316,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
shape
,
true
,
true
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
NCHW
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"TInput"
)
...
...
@@ -434,9 +326,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
DeviceType
::
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -460,8 +350,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"GPUOutput"
),
1e-5
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"GPUOutput"
),
1e-2
,
1e-2
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"GPUOutput"
),
1e-2
,
1e-2
);
}
}
...
...
@@ -477,10 +366,10 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input0"
,
shape0
,
true
,
true
);
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input1"
,
shape1
,
true
,
true
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input0"
,
NHWC
,
"TInput0"
,
NCHW
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input1"
,
NHWC
,
"TInput1"
,
NCHW
);
OpDefBuilder
(
"Eltwise"
,
"EltwiseTest"
)
.
Input
(
"TInput0"
)
.
Input
(
"TInput1"
)
...
...
@@ -491,8 +380,8 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
// Run
net
.
RunOp
(
DeviceType
::
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -518,8 +407,7 @@ void RandomTensorEltwise(const kernels::EltwiseType type,
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"GPUOutput"
),
1e-5
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"GPUOutput"
),
1e-2
,
1e-2
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"GPUOutput"
),
1e-2
,
1e-2
);
}
}
}
// namespace
...
...
@@ -549,88 +437,87 @@ TEST_F(EltwiseOpTest, RandomTensorScalarHalf) {
}
TEST_F
(
EltwiseOpTest
,
RandomTensorVecFloat
)
{
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
,
32
,
32
,
16
},
{
5
,
1
,
1
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
,
1
,
1
,
16
},
{
5
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
31
,
37
,
17
},
{
1
,
1
,
1
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
17
},
{
1
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
3
,
1
,
1
,
17
},
{
3
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
5
,
31
,
37
,
17
},
{
5
,
1
,
1
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
5
,
31
,
37
,
17
},
{
5
,
1
,
1
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
,
32
,
32
,
16
},
{
5
,
1
,
1
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
5
,
1
,
1
,
16
},
{
5
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
31
,
37
,
17
},
{
1
,
1
,
1
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
17
},
{
1
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
3
,
1
,
1
,
17
},
{
3
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
5
,
31
,
37
,
17
},
{
5
,
1
,
1
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
5
,
31
,
37
,
17
},
{
5
,
1
,
1
,
17
});
}
TEST_F
(
EltwiseOpTest
,
RandomTensorVecHalf
)
{
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
},
{
3
,
1
,
1
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
1
,
1
,
16
},
{
3
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
17
},
{
1
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
},
{
5
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
},
{
1
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
1
,
1
,
17
},
{
5
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MAX
,
{
3
,
31
,
37
,
17
},
{
3
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
3
,
31
,
37
,
17
},
{
3
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
},
{
3
,
1
,
1
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
},
{
1
,
1
,
1
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
1
,
1
,
16
},
{
3
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
1
,
1
,
17
},
{
1
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
},
{
5
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
},
{
1
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
1
,
1
,
17
},
{
5
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
1
,
1
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MAX
,
{
3
,
31
,
37
,
17
},
{
3
,
1
,
1
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
3
,
31
,
37
,
17
},
{
3
,
1
,
1
,
17
});
}
TEST_F
(
EltwiseOpTest
,
RandomTensorTensorFloat
)
{
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
}
,
{
1
,
32
,
32
,
16
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
}
,
{
3
,
32
,
32
,
16
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
31
,
37
,
17
}
,
{
1
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
}
,
{
5
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
32
,
32
,
16
}
,
{
1
,
32
,
32
,
16
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
3
,
31
,
37
,
17
}
,
{
3
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
3
,
31
,
37
,
17
}
,
{
3
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
},
{
3
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
31
,
37
,
17
},
{
1
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
},
{
5
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
32
,
32
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
MAX
,
{
3
,
31
,
37
,
17
},
{
3
,
31
,
37
,
17
});
RandomTensorEltwise
<
float
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
3
,
31
,
37
,
17
},
{
3
,
31
,
37
,
17
});
}
TEST_F
(
EltwiseOpTest
,
RandomTensorTensorHalf
)
{
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
}
,
{
1
,
32
,
32
,
16
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
}
,
{
3
,
32
,
32
,
16
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
31
,
37
,
17
}
,
{
1
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
}
,
{
5
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
32
,
32
,
16
}
,
{
1
,
32
,
32
,
16
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MAX
,
{
3
,
31
,
37
,
17
}
,
{
3
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
3
,
31
,
37
,
17
}
,
{
3
,
31
,
37
,
17
}
);
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUM
,
{
1
,
32
,
32
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SUB
,
{
3
,
32
,
32
,
16
},
{
3
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
PROD
,
{
1
,
31
,
37
,
17
},
{
1
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
DIV
,
{
5
,
31
,
37
,
17
},
{
5
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MIN
,
{
1
,
32
,
32
,
16
},
{
1
,
32
,
32
,
16
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
MAX
,
{
3
,
31
,
37
,
17
},
{
3
,
31
,
37
,
17
});
RandomTensorEltwise
<
half
>
(
kernels
::
EltwiseType
::
SQR_DIFF
,
{
3
,
31
,
37
,
17
},
{
3
,
31
,
37
,
17
});
}
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/folded_batch_norm.h
浏览文件 @
33415ee9
...
...
@@ -47,7 +47,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
offset
->
dim_size
());
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
return
functor_
(
input
,
scale
,
offset
,
nullptr
,
nullptr
,
0
,
output
,
future
);
}
...
...
mace/ops/folded_batch_norm_test.cc
浏览文件 @
33415ee9
...
...
@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
}
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
()
{
OpsTestNet
net
;
...
...
@@ -83,10 +83,9 @@ void Simple() {
}
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
6
,
2
,
1
},
{
-
3.8543
,
-
3.8543
,
-
1.5125
,
-
1.5125
,
0.8291
,
0.8291
,
3.1708
,
3.1708
,
5.5125
,
5.5125
,
7.8543
,
7.8543
});
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
6
,
2
,
1
},
{
-
3.8543
,
-
3.8543
,
-
1.5125
,
-
1.5125
,
0.8291
,
0.8291
,
3.1708
,
3.1708
,
5.5125
,
5.5125
,
7.8543
,
7.8543
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-4
);
}
...
...
@@ -108,14 +107,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"FoldedBatchNorm"
,
"FoldedBatchNormTest"
)
...
...
@@ -128,9 +125,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -173,14 +168,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"FoldedBatchNorm"
,
"FoldedBatchNormTest"
)
...
...
@@ -193,9 +186,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -239,14 +230,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"FoldedBatchNorm"
,
"FoldedBatchNormTest"
)
...
...
@@ -259,9 +248,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -303,14 +290,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Scale"
,
{
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Offset"
,
{
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"FoldedBatchNorm"
,
"FoldedBatchNormTest"
)
...
...
@@ -323,9 +308,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// run cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
mace/ops/fully_connected.h
浏览文件 @
33415ee9
...
...
@@ -23,7 +23,7 @@
namespace
mace
{
namespace
ops
{
template
<
DeviceType
D
,
class
T
>
template
<
DeviceType
D
,
class
T
>
class
FullyConnectedOp
:
public
Operator
<
D
,
T
>
{
public:
FullyConnectedOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
...
...
@@ -40,29 +40,19 @@ class FullyConnectedOp : public Operator<D, T> {
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
if
(
D
==
DeviceType
::
CPU
)
{
MACE_CHECK
(
input
->
dim
(
1
)
==
weight
->
dim
(
1
)
&&
input
->
dim
(
2
)
==
weight
->
dim
(
2
)
&&
input
->
dim
(
3
)
==
weight
->
dim
(
3
)
&&
weight
->
dim
(
0
)
==
bias
->
dim
(
0
),
"The shape of Input: "
,
MakeString
(
input
->
shape
()),
"The shape of Weight: "
,
MakeString
(
weight
->
shape
()),
" and Bias "
,
bias
->
dim
(
0
),
" don't match."
);
MACE_CHECK
(
input
->
dim
(
1
)
==
weight
->
dim
(
1
)
&&
input
->
dim
(
2
)
==
weight
->
dim
(
2
)
&&
input
->
dim
(
3
)
==
weight
->
dim
(
3
)
&&
weight
->
dim
(
0
)
==
bias
->
dim
(
0
),
"The shape of Input: "
,
MakeString
(
input
->
shape
()),
"The shape of Weight: "
,
MakeString
(
weight
->
shape
()),
" and Bias "
,
bias
->
dim
(
0
),
" don't match."
);
}
else
{
MACE_CHECK
(
input
->
dim
(
1
)
==
weight
->
dim
(
2
)
&&
input
->
dim
(
2
)
==
weight
->
dim
(
3
)
&&
input
->
dim
(
3
)
==
weight
->
dim
(
1
)
&&
weight
->
dim
(
0
)
==
bias
->
dim
(
0
),
"The shape of Input: "
,
MakeString
(
input
->
shape
()),
"The shape of Weight: "
,
MakeString
(
weight
->
shape
()),
" and Bias "
,
bias
->
dim
(
0
),
" don't match."
);
MACE_CHECK
(
input
->
dim
(
1
)
==
weight
->
dim
(
2
)
&&
input
->
dim
(
2
)
==
weight
->
dim
(
3
)
&&
input
->
dim
(
3
)
==
weight
->
dim
(
1
)
&&
weight
->
dim
(
0
)
==
bias
->
dim
(
0
),
"The shape of Input: "
,
MakeString
(
input
->
shape
()),
"The shape of Weight: "
,
MakeString
(
weight
->
shape
()),
" and Bias "
,
bias
->
dim
(
0
),
" don't match."
);
}
return
functor_
(
input
,
weight
,
bias
,
output
,
future
);
...
...
mace/ops/fully_connected_test.cc
浏览文件 @
33415ee9
...
...
@@ -24,7 +24,7 @@ namespace test {
class
FullyConnectedOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
float
>
&
input_value
,
const
std
::
vector
<
index_t
>
&
weight_shape
,
...
...
@@ -111,8 +111,8 @@ TEST_F(FullyConnectedOpTest, SimpleOPENCL) {
{
2
},
{
2
,
3
},
{
1
,
1
,
1
,
2
},
{
387
,
3853
});
Simple
<
DeviceType
::
GPU
>
(
{
1
,
1
,
2
,
3
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
5
,
3
,
1
,
2
},
{
1
,
4
,
2
,
5
,
3
,
6
,
10
,
40
,
20
,
50
,
30
,
60
,
1
,
4
,
2
,
5
,
3
,
6
,
10
,
40
,
20
,
50
,
30
,
60
,
1
,
4
,
2
,
5
,
3
,
6
},
{
1
,
4
,
2
,
5
,
3
,
6
,
10
,
40
,
20
,
50
,
30
,
60
,
1
,
4
,
2
,
5
,
3
,
6
,
10
,
40
,
20
,
50
,
30
,
60
,
1
,
4
,
2
,
5
,
3
,
6
},
{
5
},
{
1
,
2
,
3
,
4
,
5
},
{
1
,
1
,
1
,
5
},
{
92
,
912
,
94
,
914
,
96
});
}
...
...
@@ -122,7 +122,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
}
namespace
{
template
<
typename
T
>
template
<
typename
T
>
void
Random
(
const
index_t
batch
,
const
index_t
height
,
const
index_t
width
,
...
...
@@ -134,15 +134,13 @@ void Random(const index_t batch,
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Weight"
,
{
out_channel
,
channels
,
height
,
width
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Bias"
,
{
out_channel
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"FullyConnected"
,
"FullyConnectedTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -182,11 +180,11 @@ void Random(const index_t batch,
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DataType
::
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-1
,
1e-1
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-1
,
1e-1
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-
2
,
1e-
3
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-3
);
}
}
}
// namespace
...
...
mace/ops/local_response_norm.h
浏览文件 @
33415ee9
...
...
@@ -25,8 +25,7 @@ template <DeviceType D, class T>
class
LocalResponseNormOp
:
public
Operator
<
D
,
T
>
{
public:
LocalResponseNormOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
functor_
()
{
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
functor_
()
{
depth_radius_
=
OperatorBase
::
GetOptionalArg
<
int
>
(
"depth_radius"
,
5
);
bias_
=
OperatorBase
::
GetOptionalArg
<
float
>
(
"bias"
,
1.0
f
);
alpha_
=
OperatorBase
::
GetOptionalArg
<
float
>
(
"alpha"
,
1.0
f
);
...
...
@@ -40,7 +39,7 @@ class LocalResponseNormOp : public Operator<D, T> {
input
->
dim_size
());
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
return
functor_
(
input
,
depth_radius_
,
bias_
,
alpha_
,
beta_
,
output
,
future
);
}
...
...
mace/ops/local_response_norm_test.cc
浏览文件 @
33415ee9
...
...
@@ -21,7 +21,7 @@ namespace test {
class
LocalResponseNormOpTest
:
public
OpsTestBase
{};
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
()
{
OpsTestNet
net
;
...
...
@@ -46,9 +46,9 @@ void Simple() {
}
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
6
},
{
0.28
,
0.28
,
0.39
,
0.39
,
0.51
,
0.51
,
0.34
,
0.34
,
0.40
,
0.40
,
0.47
,
0.47
});
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
1
,
2
,
6
}
,
{
0.28
,
0.28
,
0.39
,
0.39
,
0.51
,
0.51
,
0.34
,
0.34
,
0.40
,
0.40
,
0.47
,
0.47
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0
,
1e-2
);
}
...
...
mace/ops/matmul_test.cc
浏览文件 @
33415ee9
...
...
@@ -92,8 +92,7 @@ TEST_F(MatMulOpTest, SimpleCPUWithBatch) {
TEST_F
(
MatMulOpTest
,
SimpleOPENCL
)
{
Simple
<
DeviceType
::
GPU
>
({
1
,
2
,
3
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
3
,
2
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
{
1
,
2
,
3
,
4
,
5
,
6
},
{
1
,
2
,
2
,
1
},
{
22
,
28
,
49
,
64
});
Simple
<
DeviceType
::
GPU
>
(
{
1
,
5
,
5
,
1
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
},
...
...
@@ -127,10 +126,9 @@ void Complex(const index_t batch,
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"A"
,
{
batch
,
height
,
channels
,
1
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
,
1
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"A"
,
{
batch
,
height
,
channels
,
1
});
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
,
1
});
// run cpu
net
.
RunOp
();
...
...
@@ -158,11 +156,11 @@ void Complex(const index_t batch,
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_HEIGHT
);
if
(
DataTypeToEnum
<
T
>::
value
==
DataType
::
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-
2
,
1e-
1
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-2
,
1e-1
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
,
1e-5
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
,
1e-5
);
}
}
}
// namespace
...
...
mace/ops/ops_test_util.h
浏览文件 @
33415ee9
...
...
@@ -112,7 +112,7 @@ class OpsTestNet {
public:
OpsTestNet
()
:
op_registry_
(
new
OperatorRegistry
())
{}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
AddInputFromArray
(
const
std
::
string
&
name
,
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
T
>
&
data
)
{
...
...
@@ -125,7 +125,7 @@ class OpsTestNet {
memcpy
(
input_data
,
data
.
data
(),
data
.
size
()
*
sizeof
(
T
));
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
AddRepeatedInput
(
const
std
::
string
&
name
,
const
std
::
vector
<
index_t
>
&
shape
,
const
T
data
)
{
...
...
@@ -137,7 +137,7 @@ class OpsTestNet {
std
::
fill
(
input_data
,
input_data
+
input
->
size
(),
data
);
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
AddRandomInput
(
const
std
::
string
&
name
,
const
std
::
vector
<
index_t
>
&
shape
,
bool
positive
=
true
,
...
...
@@ -160,7 +160,7 @@ class OpsTestNet {
if
(
std
::
abs
(
d
)
>
100.
f
)
d
=
100.
f
;
if
(
std
::
abs
(
d
)
<
0.001
f
)
d
=
0.001
f
;
}
return
half_float
::
half_cast
<
half
>
(
positive
?
std
::
abs
(
d
)
:
d
);
return
half_float
::
half_cast
<
half
>
(
positive
?
std
::
abs
(
d
)
:
d
);
});
}
else
{
std
::
generate
(
input_data
,
input_data
+
input
->
size
(),
...
...
@@ -170,17 +170,15 @@ class OpsTestNet {
if
(
std
::
abs
(
d
)
>
100.
f
)
d
=
100.
f
;
if
(
std
::
abs
(
d
)
<
0.001
f
)
d
=
0.001
f
;
}
return
(
positive
?
std
::
abs
(
d
)
:
d
);
return
(
positive
?
std
::
abs
(
d
)
:
d
);
});
}
}
template
<
DeviceType
D
,
typename
T
>
void
Transpose2D
(
const
std
::
string
&
src_name
,
const
std
::
string
&
dst_name
)
{
template
<
DeviceType
D
,
typename
T
>
void
Transpose2D
(
const
std
::
string
&
src_name
,
const
std
::
string
&
dst_name
)
{
Tensor
*
input
=
ws_
.
GetTensor
(
src_name
);
Tensor
*
output
=
ws_
.
CreateTensor
(
dst_name
,
GetDeviceAllocator
(
D
),
Tensor
*
output
=
ws_
.
CreateTensor
(
dst_name
,
GetDeviceAllocator
(
D
),
DataTypeToEnum
<
T
>::
v
());
const
std
::
vector
<
index_t
>
input_shape
=
input
->
shape
();
MACE_CHECK
(
input_shape
.
size
()
==
2
,
"input shape != 2"
);
...
...
@@ -197,14 +195,13 @@ class OpsTestNet {
}
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
TransformDataFormat
(
const
std
::
string
&
src_name
,
const
DataFormat
src_format
,
const
std
::
string
&
dst_name
,
const
DataFormat
dst_format
)
{
Tensor
*
input
=
ws_
.
GetTensor
(
src_name
);
Tensor
*
output
=
ws_
.
CreateTensor
(
dst_name
,
GetDeviceAllocator
(
D
),
Tensor
*
output
=
ws_
.
CreateTensor
(
dst_name
,
GetDeviceAllocator
(
D
),
DataTypeToEnum
<
T
>::
v
());
const
std
::
vector
<
index_t
>
input_shape
=
input
->
shape
();
MACE_CHECK
(
input_shape
.
size
()
==
4
,
"input shape != 4"
);
...
...
@@ -300,7 +297,8 @@ class OpsTestNet {
for
(
index_t
c
=
0
;
c
<
in_channels
;
++
c
)
{
for
(
index_t
k
=
0
;
k
<
hw
;
++
k
)
{
output_data
[((
m
*
in_channels
)
+
c
)
*
height
*
width
+
k
]
=
input_data
[
k
*
out_channels
*
in_channels
+
c
*
out_channels
+
m
];
input_data
[
k
*
out_channels
*
in_channels
+
c
*
out_channels
+
m
];
}
}
}
...
...
@@ -309,12 +307,11 @@ class OpsTestNet {
}
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
FillNHWCInputToNCHWInput
(
const
std
::
string
&
name_nchw
,
const
std
::
string
&
name_nhwc
)
{
Tensor
*
input
=
ws_
.
GetTensor
(
name_nhwc
);
Tensor
*
output
=
ws_
.
CreateTensor
(
name_nchw
,
GetDeviceAllocator
(
D
),
Tensor
*
output
=
ws_
.
CreateTensor
(
name_nchw
,
GetDeviceAllocator
(
D
),
DataTypeToEnum
<
T
>::
v
());
const
std
::
vector
<
index_t
>
input_shape
=
input
->
shape
();
index_t
batch
=
input_shape
[
0
];
...
...
@@ -370,14 +367,12 @@ class OpsTestNet {
// DEPRECATED(liyin):
// Test and benchmark should setup model once and run multiple times.
// Setup time should not be counted during benchmark.
MaceStatus
RunOp
()
{
return
RunOp
(
DeviceType
::
CPU
);
}
MaceStatus
RunOp
()
{
return
RunOp
(
DeviceType
::
CPU
);
}
MaceStatus
RunNet
(
const
NetDef
&
net_def
,
const
DeviceType
device
)
{
device_
=
device
;
net_
=
CreateNet
(
op_registry_
,
net_def
,
&
ws_
,
device
,
NetMode
::
INIT
);
MACE_
FAILURE_RETURN
(
net_
->
Run
());
MACE_
RETURN_IF_ERROR
(
net_
->
Run
());
net_
=
CreateNet
(
op_registry_
,
net_def
,
&
ws_
,
device
);
return
net_
->
Run
();
}
...
...
@@ -415,7 +410,7 @@ class OpsTestBase : public ::testing::Test {
}
};
template
<
typename
T
>
template
<
typename
T
>
void
GenerateRandomRealTypeData
(
const
std
::
vector
<
index_t
>
&
shape
,
std
::
vector
<
T
>
*
res
,
bool
positive
=
true
)
{
...
...
@@ -430,10 +425,9 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
res
->
resize
(
size
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_HALF
)
{
std
::
generate
(
res
->
begin
(),
res
->
end
(),
[
&
gen
,
&
nd
,
positive
]
{
return
half_float
::
half_cast
<
half
>
(
positive
?
std
::
abs
(
nd
(
gen
))
:
nd
(
gen
));
std
::
generate
(
res
->
begin
(),
res
->
end
(),
[
&
gen
,
&
nd
,
positive
]
{
return
half_float
::
half_cast
<
half
>
(
positive
?
std
::
abs
(
nd
(
gen
))
:
nd
(
gen
));
});
}
else
{
std
::
generate
(
res
->
begin
(),
res
->
end
(),
[
&
gen
,
&
nd
,
positive
]
{
...
...
@@ -442,7 +436,7 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
}
}
template
<
typename
T
>
template
<
typename
T
>
void
GenerateRandomIntTypeData
(
const
std
::
vector
<
index_t
>
&
shape
,
std
::
vector
<
T
>
*
res
,
const
T
a
=
0
,
...
...
@@ -460,7 +454,7 @@ void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
std
::
generate
(
res
->
begin
(),
res
->
end
(),
[
&
gen
,
&
nd
]
{
return
nd
(
gen
);
});
}
template
<
typename
T
>
template
<
typename
T
>
std
::
vector
<
T
>
VectorStaticCast
(
const
std
::
vector
<
float
>
&&
src
)
{
std
::
vector
<
T
>
dest
;
dest
.
reserve
(
src
.
size
());
...
...
@@ -470,7 +464,7 @@ std::vector<T> VectorStaticCast(const std::vector<float> &&src) {
return
std
::
move
(
dest
);
}
template
<
typename
T
>
template
<
typename
T
>
std
::
unique_ptr
<
Tensor
>
CreateTensor
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
T
>
&
data
)
{
std
::
unique_ptr
<
Tensor
>
res
(
...
...
@@ -504,24 +498,24 @@ inline std::string ShapeToString(const Tensor &x) {
return
std
::
string
(
stream
.
str
());
}
template
<
typename
T
>
template
<
typename
T
>
struct
is_floating_point_type
{
static
const
bool
value
=
std
::
is_same
<
T
,
float
>::
value
||
std
::
is_same
<
T
,
double
>::
value
||
std
::
is_same
<
T
,
half
>::
value
;
};
template
<
typename
T
>
template
<
typename
T
>
inline
void
ExpectEqual
(
const
T
&
a
,
const
T
&
b
)
{
EXPECT_EQ
(
a
,
b
);
}
template
<
>
template
<
>
inline
void
ExpectEqual
<
float
>
(
const
float
&
a
,
const
float
&
b
)
{
EXPECT_FLOAT_EQ
(
a
,
b
);
}
template
<
>
template
<
>
inline
void
ExpectEqual
<
double
>
(
const
double
&
a
,
const
double
&
b
)
{
EXPECT_DOUBLE_EQ
(
a
,
b
);
}
...
...
@@ -531,13 +525,13 @@ inline void AssertSameDims(const Tensor &x, const Tensor &y) {
<<
"y.shape [ "
<<
ShapeToString
(
y
)
<<
"]"
;
}
template
<
typename
EXP_TYPE
,
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
,
bool
is_fp
=
is_floating_point_type
<
EXP_TYPE
>
::
value
>
struct
Expector
;
// Partial specialization for float and double.
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
>
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
>
struct
Expector
<
EXP_TYPE
,
RES_TYPE
,
true
>
{
static
void
Equal
(
const
EXP_TYPE
&
a
,
const
RES_TYPE
&
b
)
{
ExpectEqual
(
a
,
b
);
}
...
...
@@ -554,7 +548,8 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
}
}
static
void
Near
(
const
Tensor
&
x
,
const
Tensor
&
y
,
static
void
Near
(
const
Tensor
&
x
,
const
Tensor
&
y
,
const
double
rel_err
,
const
double
abs_err
)
{
ASSERT_EQ
(
x
.
dtype
(),
DataTypeToEnum
<
EXP_TYPE
>::
v
());
...
...
@@ -588,7 +583,7 @@ struct Expector<EXP_TYPE, RES_TYPE, true> {
}
};
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
>
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
>
struct
Expector
<
EXP_TYPE
,
RES_TYPE
,
false
>
{
static
void
Equal
(
const
EXP_TYPE
&
a
,
const
RES_TYPE
&
b
)
{
ExpectEqual
(
a
,
b
);
}
...
...
@@ -605,7 +600,8 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
}
}
static
void
Near
(
const
Tensor
&
x
,
const
Tensor
&
y
,
static
void
Near
(
const
Tensor
&
x
,
const
Tensor
&
y
,
const
double
rel_err
,
const
double
abs_err
)
{
MACE_UNUSED
(
rel_err
);
...
...
@@ -614,21 +610,23 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
}
};
template
<
typename
T
>
void
ExpectTensorNear
(
const
Tensor
&
x
,
const
Tensor
&
y
,
template
<
typename
T
>
void
ExpectTensorNear
(
const
Tensor
&
x
,
const
Tensor
&
y
,
const
double
rel_err
=
1e-5
,
const
double
abs_err
=
1e-8
)
{
Expector
<
T
,
T
>::
Near
(
x
,
y
,
rel_err
,
abs_err
);
}
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
>
void
ExpectTensorNear
(
const
Tensor
&
x
,
const
Tensor
&
y
,
template
<
typename
EXP_TYPE
,
typename
RES_TYPE
>
void
ExpectTensorNear
(
const
Tensor
&
x
,
const
Tensor
&
y
,
const
double
rel_err
=
1e-5
,
const
double
abs_err
=
1e-8
)
{
Expector
<
EXP_TYPE
,
RES_TYPE
>::
Near
(
x
,
y
,
rel_err
,
abs_err
);
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
BufferToImage
(
OpsTestNet
*
net
,
const
std
::
string
&
input_name
,
const
std
::
string
&
output_name
,
...
...
@@ -648,7 +646,7 @@ void BufferToImage(OpsTestNet *net,
net
->
Sync
();
}
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
ImageToBuffer
(
OpsTestNet
*
net
,
const
std
::
string
&
input_name
,
const
std
::
string
&
output_name
,
...
...
mace/ops/pad.h
浏览文件 @
33415ee9
...
...
@@ -29,8 +29,7 @@ class PadOp : public Operator<D, T> {
PadOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
functor_
(
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"paddings"
),
OperatorBase
::
GetOptionalArg
<
float
>
(
"constant_value"
,
0.0
))
{}
OperatorBase
::
GetOptionalArg
<
float
>
(
"constant_value"
,
0.0
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input_tensor
=
this
->
Input
(
0
);
...
...
mace/ops/pad_test.cc
浏览文件 @
33415ee9
...
...
@@ -45,9 +45,7 @@ void Simple() {
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
else
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
NCHW
);
OpDefBuilder
(
"Pad"
,
"PadTest"
)
.
Input
(
"TInput"
)
...
...
@@ -59,33 +57,25 @@ void Simple() {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
}
auto
output
=
net
.
GetTensor
(
"Output"
);
auto
expected
=
CreateTensor
<
float
>
({
1
,
5
,
6
,
1
},
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
2
,
2
,
2
,
1.0
,
1.0
,
1.0
,
2
,
2
,
2
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
auto
expected
=
CreateTensor
<
float
>
(
{
1
,
5
,
6
,
1
},
{
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
2
,
2
,
2
,
1.0
,
1.0
,
1.0
,
2
,
2
,
2
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
output
,
1e-5
);
}
}
// namespace
TEST_F
(
PadTest
,
SimpleCPU
)
{
Simple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
PadTest
,
SimpleCPU
)
{
Simple
<
DeviceType
::
CPU
>
();
}
TEST_F
(
PadTest
,
SimpleGPU
)
{
Simple
<
DeviceType
::
GPU
>
();
}
TEST_F
(
PadTest
,
SimpleGPU
)
{
Simple
<
DeviceType
::
GPU
>
();
}
TEST_F
(
PadTest
,
ComplexCPU
)
{
// Construct graph
...
...
@@ -93,9 +83,7 @@ TEST_F(PadTest, ComplexCPU) {
// Add input data
net
.
AddRepeatedInput
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
1
,
1
,
1
,
2
},
2
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
NCHW
);
OpDefBuilder
(
"Pad"
,
"PadTest"
)
.
Input
(
"TInput"
)
...
...
@@ -106,9 +94,7 @@ TEST_F(PadTest, ComplexCPU) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
auto
output
=
net
.
GetTensor
(
"Output"
);
...
...
@@ -134,9 +120,7 @@ void Complex(const std::vector<index_t> &input_shape,
// Add input data
net
.
AddRandomInput
<
DeviceType
::
GPU
,
float
>
(
"Input"
,
input_shape
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"TInput"
,
NCHW
);
OpDefBuilder
(
"Pad"
,
"PadTest"
)
.
Input
(
"TInput"
)
...
...
@@ -147,9 +131,7 @@ void Complex(const std::vector<index_t> &input_shape,
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"TOutput"
,
NCHW
,
"Output"
,
NHWC
);
Tensor
expected
;
...
...
@@ -181,24 +163,23 @@ void Complex(const std::vector<index_t> &input_shape,
}
// namespace
TEST_F
(
PadTest
,
ComplexFloat
)
{
Complex
<
float
>
({
1
,
32
,
32
,
4
},
{
0
,
0
,
0
,
0
,
2
,
2
,
1
,
1
},
{
0
,
0
,
2
,
2
,
1
,
1
,
0
,
0
});
Complex
<
float
>
({
1
,
31
,
37
,
16
},
{
0
,
0
,
0
,
0
,
2
,
0
,
1
,
0
},
{
0
,
0
,
2
,
0
,
1
,
0
,
0
,
0
});
Complex
<
float
>
({
1
,
128
,
128
,
32
},
{
0
,
0
,
0
,
0
,
0
,
1
,
0
,
2
},
{
0
,
0
,
0
,
1
,
0
,
2
,
0
,
0
});
Complex
<
float
>
({
1
,
32
,
32
,
4
},
{
0
,
0
,
0
,
0
,
2
,
2
,
1
,
1
},
{
0
,
0
,
2
,
2
,
1
,
1
,
0
,
0
});
Complex
<
float
>
({
1
,
31
,
37
,
16
},
{
0
,
0
,
0
,
0
,
2
,
0
,
1
,
0
},
{
0
,
0
,
2
,
0
,
1
,
0
,
0
,
0
});
Complex
<
float
>
({
1
,
128
,
128
,
32
},
{
0
,
0
,
0
,
0
,
0
,
1
,
0
,
2
},
{
0
,
0
,
0
,
1
,
0
,
2
,
0
,
0
});
}
TEST_F
(
PadTest
,
ComplexHalf
)
{
Complex
<
half
>
({
1
,
32
,
32
,
4
},
{
0
,
0
,
0
,
0
,
2
,
2
,
1
,
1
},
{
0
,
0
,
2
,
2
,
1
,
1
,
0
,
0
});
Complex
<
half
>
({
1
,
31
,
37
,
16
},
{
0
,
0
,
0
,
0
,
2
,
0
,
1
,
0
},
{
0
,
0
,
2
,
0
,
1
,
0
,
0
,
0
});
Complex
<
half
>
({
1
,
128
,
128
,
32
},
{
0
,
0
,
0
,
0
,
0
,
1
,
0
,
2
},
{
0
,
0
,
0
,
1
,
0
,
2
,
0
,
0
});
Complex
<
half
>
({
1
,
32
,
32
,
4
},
{
0
,
0
,
0
,
0
,
2
,
2
,
1
,
1
},
{
0
,
0
,
2
,
2
,
1
,
1
,
0
,
0
});
Complex
<
half
>
({
1
,
31
,
37
,
16
},
{
0
,
0
,
0
,
0
,
2
,
0
,
1
,
0
},
{
0
,
0
,
2
,
0
,
1
,
0
,
0
,
0
});
Complex
<
half
>
({
1
,
128
,
128
,
32
},
{
0
,
0
,
0
,
0
,
0
,
1
,
0
,
2
},
{
0
,
0
,
0
,
1
,
0
,
2
,
0
,
0
});
}
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/pooling_test.cc
浏览文件 @
33415ee9
...
...
@@ -35,9 +35,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
{
0
,
16
,
1
,
17
,
2
,
18
,
3
,
19
,
4
,
20
,
5
,
21
,
6
,
22
,
7
,
23
,
8
,
24
,
9
,
25
,
10
,
26
,
11
,
27
,
12
,
28
,
13
,
29
,
14
,
30
,
15
,
31
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -53,9 +51,7 @@ TEST_F(PoolingOpTest, MAX_VALID) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -73,9 +69,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
1
,
3
,
3
,
1
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -91,9 +85,7 @@ TEST_F(PoolingOpTest, MAX_SAME) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -111,9 +103,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
"Input"
,
{
1
,
4
,
4
,
1
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -129,9 +119,7 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -149,9 +137,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
"Input"
,
{
1
,
2
,
9
,
1
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -164,13 +150,10 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -180,7 +163,7 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
}
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
SimpleMaxPooling3S2
()
{
// Construct graph
OpsTestNet
net
;
...
...
@@ -192,9 +175,7 @@ void SimpleMaxPooling3S2() {
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
// Run
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -207,10 +188,8 @@ void SimpleMaxPooling3S2() {
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
...
...
@@ -237,12 +216,10 @@ void SimpleMaxPooling3S2() {
TEST_F
(
PoolingOpTest
,
CPUSimpleMaxPooling3S2
)
{
SimpleMaxPooling3S2
<
CPU
>
();
}
TEST_F
(
PoolingOpTest
,
OPENCLSimpleMaxPooling3S2
)
{
SimpleMaxPooling3S2
<
GPU
>
();
}
TEST_F
(
PoolingOpTest
,
OPENCLSimpleMaxPooling3S2
)
{
SimpleMaxPooling3S2
<
GPU
>
();
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
MaxPooling3S2
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
int
>
strides
,
Padding
padding
)
{
...
...
@@ -252,9 +229,7 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
input_shape
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -270,9 +245,7 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
Tensor
expected
;
...
...
@@ -295,8 +268,8 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-
3
,
1e-
4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-3
,
1e-4
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
);
}
...
...
@@ -334,9 +307,7 @@ TEST_F(PoolingOpTest, AVG_VALID) {
{
0
,
16
,
1
,
17
,
2
,
18
,
3
,
19
,
4
,
20
,
5
,
21
,
6
,
22
,
7
,
23
,
8
,
24
,
9
,
25
,
10
,
26
,
11
,
27
,
12
,
28
,
13
,
29
,
14
,
30
,
15
,
31
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -349,13 +320,10 @@ TEST_F(PoolingOpTest, AVG_VALID) {
.
AddIntArg
(
"pooling_type"
,
PoolingType
::
AVG
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -366,7 +334,7 @@ TEST_F(PoolingOpTest, AVG_VALID) {
}
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
SimpleAvgPoolingTest
()
{
// Construct graph
OpsTestNet
net
;
...
...
@@ -399,12 +367,10 @@ void SimpleAvgPoolingTest() {
}
}
// namespace
TEST_F
(
PoolingOpTest
,
OPENCLSimpleAvgPooling
)
{
SimpleAvgPoolingTest
<
GPU
>
();
}
TEST_F
(
PoolingOpTest
,
OPENCLSimpleAvgPooling
)
{
SimpleAvgPoolingTest
<
GPU
>
();
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
AvgPoolingTest
(
const
std
::
vector
<
index_t
>
&
shape
,
const
std
::
vector
<
int
>
&
kernels
,
const
std
::
vector
<
int
>
&
strides
,
...
...
@@ -415,9 +381,7 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
shape
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"Pooling"
,
"PoolingTest"
)
...
...
@@ -433,9 +397,7 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
// run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
Tensor
expected
;
...
...
@@ -458,25 +420,21 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-3
,
1e-3
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-3
,
1e-3
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
);
}
}
}
// namespace
TEST_F
(
PoolingOpTest
,
OPENCLAlignedAvgPooling
)
{
AvgPoolingTest
<
GPU
,
float
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
SAME
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
SAME
);
}
TEST_F
(
PoolingOpTest
,
OPENCLHalfAlignedAvgPooling
)
{
AvgPoolingTest
<
GPU
,
half
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
half
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
half
>
({
3
,
15
,
15
,
128
},
{
4
,
4
},
{
4
,
4
},
Padding
::
SAME
);
}
...
...
@@ -495,17 +453,13 @@ TEST_F(PoolingOpTest, OPENCLHalfAlignedLargeKernelAvgPooling) {
}
TEST_F
(
PoolingOpTest
,
OPENCLUnAlignedAvgPooling
)
{
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
2
,
2
},
{
2
,
2
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
2
,
2
},
{
2
,
2
},
Padding
::
SAME
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
2
,
2
},
{
2
,
2
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
2
,
2
},
{
2
,
2
},
Padding
::
SAME
);
}
TEST_F
(
PoolingOpTest
,
OPENCLUnAlignedLargeKernelAvgPooling
)
{
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
8
,
8
},
{
8
,
8
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
8
,
8
},
{
8
,
8
},
Padding
::
SAME
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
8
,
8
},
{
8
,
8
},
Padding
::
VALID
);
AvgPoolingTest
<
GPU
,
float
>
({
3
,
31
,
37
,
128
},
{
8
,
8
},
{
8
,
8
},
Padding
::
SAME
);
}
}
// namespace test
...
...
mace/ops/proposal_test.cc
浏览文件 @
33415ee9
...
...
@@ -45,17 +45,17 @@ TEST_F(ProposalOpTest, CPUSimple) {
.
Finalize
(
net
.
NewOperatorDef
());
std
::
vector
<
float
>
scores
(
height
*
width
*
18
);
for
(
size_t
i
=
0
;
i
<
scores
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
scores
.
size
();
++
i
)
{
scores
[
i
]
=
i
;
}
// Add input data
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"RpnCLSProb"
,
{
1
,
height
,
width
,
18
},
scores
);
net
.
AddRepeatedInput
<
DeviceType
::
CPU
,
float
>
(
"RpnBBoxPred"
,
{
1
,
height
,
width
,
4
*
9
},
1
);
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"ImgInfo"
,
{
1
,
1
,
1
,
3
},
{
img_height
,
img_width
,
2
});
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"RpnCLSProb"
,
{
1
,
height
,
width
,
18
},
scores
);
net
.
AddRepeatedInput
<
DeviceType
::
CPU
,
float
>
(
"RpnBBoxPred"
,
{
1
,
height
,
width
,
4
*
9
},
1
);
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"ImgInfo"
,
{
1
,
1
,
1
,
3
},
{
img_height
,
img_width
,
2
});
// Run
net
.
RunOp
();
...
...
@@ -65,7 +65,6 @@ TEST_F(ProposalOpTest, CPUSimple) {
ExpectTensorNear
<
float
>
(
*
expected_tensor
,
*
net
.
GetTensor
(
"Output"
),
1e-5
);
}
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/quantize.cc
浏览文件 @
33415ee9
mace/ops/quantize.h
浏览文件 @
33415ee9
...
...
@@ -21,12 +21,11 @@
namespace
mace
{
namespace
ops
{
template
<
DeviceType
D
,
class
T
>
template
<
DeviceType
D
,
class
T
>
class
QuantizeOp
:
public
Operator
<
D
,
T
>
{
public:
QuantizeOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{
}
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
@@ -39,9 +38,9 @@ class QuantizeOp : public Operator<D, T> {
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
Tensor
*
out_min
=
this
->
Output
(
OUT_MIN
);
Tensor
*
out_max
=
this
->
Output
(
OUT_MAX
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
FAILURE_RETURN
(
out_min
->
ResizeLike
(
in_min
));
MACE_
FAILURE_RETURN
(
out_max
->
ResizeLike
(
in_max
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
out_min
->
ResizeLike
(
in_min
));
MACE_
RETURN_IF_ERROR
(
out_max
->
ResizeLike
(
in_max
));
return
functor_
(
input
,
in_min
,
in_max
,
output
,
out_min
,
out_max
,
future
);
}
...
...
@@ -54,12 +53,11 @@ class QuantizeOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS
(
OUTPUT
,
OUT_MIN
,
OUT_MAX
);
};
template
<
DeviceType
D
,
class
T
>
template
<
DeviceType
D
,
class
T
>
class
DequantizeOp
:
public
Operator
<
D
,
T
>
{
public:
DequantizeOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{
}
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
@@ -70,7 +68,7 @@ class DequantizeOp : public Operator<D, T> {
MACE_CHECK
(
in_max
->
size
()
==
1
,
"max val tensor has more than 1 value"
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_
FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_
RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
return
functor_
(
input
,
in_min
,
in_max
,
output
,
future
);
}
...
...
@@ -83,12 +81,11 @@ class DequantizeOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS
(
OUTPUT
);
};
template
<
DeviceType
D
,
class
T
>
template
<
DeviceType
D
,
class
T
>
class
RequantizeOp
:
public
Operator
<
D
,
T
>
{
public:
RequantizeOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{
}
:
Operator
<
D
,
T
>
(
operator_def
,
ws
)
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
...
...
@@ -112,19 +109,12 @@ class RequantizeOp : public Operator<D, T> {
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
Tensor
*
out_min
=
this
->
Output
(
OUT_MIN
);
Tensor
*
out_max
=
this
->
Output
(
OUT_MAX
);
MACE_FAILURE_RETURN
(
output
->
ResizeLike
(
input
));
MACE_FAILURE_RETURN
(
out_min
->
ResizeLike
(
in_min
));
MACE_FAILURE_RETURN
(
out_max
->
ResizeLike
(
out_max
));
return
functor_
(
input
,
in_min
,
in_max
,
rerange_min
,
rerange_max
,
output
,
out_min
,
out_max
,
future
);
MACE_RETURN_IF_ERROR
(
output
->
ResizeLike
(
input
));
MACE_RETURN_IF_ERROR
(
out_min
->
ResizeLike
(
in_min
));
MACE_RETURN_IF_ERROR
(
out_max
->
ResizeLike
(
out_max
));
return
functor_
(
input
,
in_min
,
in_max
,
rerange_min
,
rerange_max
,
output
,
out_min
,
out_max
,
future
);
}
private:
...
...
mace/ops/quantize_test.cc
浏览文件 @
33415ee9
...
...
@@ -26,9 +26,8 @@ TEST_F(QuantizeTest, TestQuantize) {
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
CPU
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
-
2
,
-
1
,
1
,
2
,
3
,
4
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
-
2
,
-
1
,
1
,
2
,
3
,
4
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMin"
,
{
1
},
{
-
3
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMax"
,
{
1
},
{
5
});
...
...
@@ -50,10 +49,8 @@ TEST_F(QuantizeTest, TestQuantize) {
auto
output_min
=
net
.
GetTensor
(
"OutputMin"
);
auto
output_max
=
net
.
GetTensor
(
"OutputMax"
);
auto
expected_output
=
CreateTensor
<
uint8_t
>
({
1
,
2
,
3
,
1
},
{
32
,
64
,
127
,
159
,
191
,
223
});
auto
expected_output
=
CreateTensor
<
uint8_t
>
({
1
,
2
,
3
,
1
},
{
32
,
64
,
127
,
159
,
191
,
223
});
auto
expected_min
=
CreateTensor
<
float
>
({
1
},
{
-
3.01887
});
auto
expected_max
=
CreateTensor
<
float
>
({
1
},
{
5
});
...
...
@@ -69,16 +66,14 @@ TEST_F(QuantizeTest, TestQuantizeTrend) {
// Add input data
net
.
AddRandomInput
<
CPU
,
float
>
(
"Input"
,
{
100
});
const
float
*
input_data
=
net
.
GetTensor
(
"Input"
)
->
data
<
float
>
();
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMin"
,
{
1
},
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMin"
,
{
1
},
{
*
std
::
min_element
(
input_data
,
input_data
+
net
.
GetTensor
(
"Input"
)
->
size
())});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMax"
,
{
1
},
input_data
+
net
.
GetTensor
(
"Input"
)
->
size
())});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMax"
,
{
1
},
{
*
std
::
max_element
(
input_data
,
input_data
+
net
.
GetTensor
(
"Input"
)
->
size
())});
input_data
+
net
.
GetTensor
(
"Input"
)
->
size
())});
OpDefBuilder
(
"Quantize"
,
"QuantizeTest"
)
.
Input
(
"Input"
)
...
...
@@ -113,9 +108,8 @@ TEST_F(QuantizeTest, TestDequantize) {
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
CPU
,
uint8_t
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
32
,
64
,
127
,
159
,
191
,
223
});
net
.
AddInputFromArray
<
CPU
,
uint8_t
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
32
,
64
,
127
,
159
,
191
,
223
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMin"
,
{
1
},
{
-
3.01887
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMax"
,
{
1
},
{
5
});
...
...
@@ -132,10 +126,8 @@ TEST_F(QuantizeTest, TestDequantize) {
net
.
RunOp
();
auto
output
=
net
.
GetTensor
(
"Output"
);
auto
expected_output
=
CreateTensor
<
float
>
({
1
,
2
,
3
,
1
},
{
-
2
,
-
1
,
1
,
2
,
3
,
4
});
auto
expected_output
=
CreateTensor
<
float
>
({
1
,
2
,
3
,
1
},
{
-
2
,
-
1
,
1
,
2
,
3
,
4
});
auto
expected_min
=
CreateTensor
<
float
>
({
1
},
{
-
3.01887
});
auto
expected_max
=
CreateTensor
<
float
>
({
1
},
{
5
});
...
...
@@ -147,9 +139,9 @@ TEST_F(QuantizeTest, TestRequantizeWithMinMax) {
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
CPU
,
int
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
-
1073741824
,
-
536870912
,
536870912
,
1073741824
,
1610612736
,
2147483647
});
net
.
AddInputFromArray
<
CPU
,
int
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
-
1073741824
,
-
536870912
,
536870912
,
1073741824
,
1610612736
,
2147483647
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMin"
,
{
1
},
{
-
3
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMax"
,
{
1
},
{
5
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"RerangeMin"
,
{
1
},
{
-
3.01887
});
...
...
@@ -172,10 +164,8 @@ TEST_F(QuantizeTest, TestRequantizeWithMinMax) {
net
.
RunOp
();
auto
output
=
net
.
GetTensor
(
"Output"
);
auto
expected_output
=
CreateTensor
<
uint8_t
>
({
1
,
2
,
3
,
1
},
{
32
,
64
,
128
,
160
,
191
,
223
});
auto
expected_output
=
CreateTensor
<
uint8_t
>
({
1
,
2
,
3
,
1
},
{
32
,
64
,
128
,
160
,
191
,
223
});
auto
expected_min
=
CreateTensor
<
float
>
({
1
},
{
-
3.01887
});
auto
expected_max
=
CreateTensor
<
float
>
({
1
},
{
5
});
...
...
@@ -187,9 +177,9 @@ TEST_F(QuantizeTest, TestRequantizeWithoutMinMax) {
OpsTestNet
net
;
// Add input data
net
.
AddInputFromArray
<
CPU
,
int
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
-
1073741824
,
-
536870912
,
536870912
,
1073741824
,
1610612736
,
2147483647
});
net
.
AddInputFromArray
<
CPU
,
int
>
(
"Input"
,
{
1
,
2
,
3
,
1
},
{
-
1073741824
,
-
536870912
,
536870912
,
1073741824
,
1610612736
,
2147483647
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMin"
,
{
1
},
{
-
3
});
net
.
AddInputFromArray
<
CPU
,
float
>
(
"InputMax"
,
{
1
},
{
5
});
...
...
@@ -208,10 +198,8 @@ TEST_F(QuantizeTest, TestRequantizeWithoutMinMax) {
net
.
RunOp
();
auto
output
=
net
.
GetTensor
(
"Output"
);
auto
expected_output
=
CreateTensor
<
uint8_t
>
({
1
,
2
,
3
,
1
},
{
0
,
43
,
128
,
170
,
213
,
255
});
auto
expected_output
=
CreateTensor
<
uint8_t
>
({
1
,
2
,
3
,
1
},
{
0
,
43
,
128
,
170
,
213
,
255
});
auto
expected_min
=
CreateTensor
<
float
>
({
1
},
{
-
3.01887
});
auto
expected_max
=
CreateTensor
<
float
>
({
1
},
{
5
});
ExpectTensorNear
<
uint8_t
>
(
*
expected_output
,
*
output
);
...
...
mace/ops/resize_bilinear.h
浏览文件 @
33415ee9
...
...
@@ -26,8 +26,7 @@ class ResizeBilinearOp : public Operator<D, T> {
public:
ResizeBilinearOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
operator_def
,
ws
),
functor_
(
OperatorBase
::
GetRepeatedArgs
<
index_t
>
(
"size"
,
{
-
1
,
-
1
}),
functor_
(
OperatorBase
::
GetRepeatedArgs
<
index_t
>
(
"size"
,
{
-
1
,
-
1
}),
OperatorBase
::
GetOptionalArg
<
bool
>
(
"align_corners"
,
false
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
...
...
mace/ops/resize_bilinear_test.cc
浏览文件 @
33415ee9
...
...
@@ -15,8 +15,8 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/resize_bilinear.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/resize_bilinear.h"
namespace
mace
{
namespace
ops
{
...
...
@@ -33,9 +33,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
std
::
vector
<
float
>
input
(
24
);
std
::
iota
(
begin
(
input
),
end
(
input
),
0
);
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
1
,
2
,
4
,
3
},
input
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"ResizeBilinear"
,
"ResizeBilinearTest"
)
...
...
@@ -46,9 +44,7 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
...
...
@@ -66,9 +62,7 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
std
::
vector
<
float
>
input
(
24
);
std
::
iota
(
begin
(
input
),
end
(
input
),
0
);
net
.
AddInputFromArray
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
{
1
,
2
,
4
,
3
},
input
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"ResizeBilinear"
,
"ResizeBilinearTest"
)
...
...
@@ -80,12 +74,9 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
2
,
3
},
{
0
,
1
,
2
,
9
,
10
,
11
});
...
...
@@ -111,9 +102,7 @@ void TestRandomResizeBilinear() {
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
in_height
,
in_width
,
channels
});
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"ResizeBilinear"
,
"ResizeBilinearTest"
)
...
...
@@ -124,10 +113,8 @@ void TestRandomResizeBilinear() {
.
Finalize
(
net
.
NewOperatorDef
());
// Run on CPU
net
.
RunOp
(
DeviceType
::
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -149,8 +136,8 @@ void TestRandomResizeBilinear() {
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
// Check
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
1e-
5
,
1e-
6
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"DeviceOutput"
),
1e-5
,
1e-6
);
}
}
}
// namespace
...
...
mace/ops/slice.h
浏览文件 @
33415ee9
mace/ops/slice_test.cc
浏览文件 @
33415ee9
...
...
@@ -16,8 +16,8 @@
#include <vector>
#include "gmock/gmock.h"
#include "mace/ops/slice.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/slice.h"
namespace
mace
{
namespace
ops
{
...
...
@@ -26,7 +26,7 @@ namespace test {
class
SliceOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
void
RandomTest
(
const
int
num_outputs
,
const
int
axis
)
{
static
unsigned
int
seed
=
time
(
NULL
);
const
index_t
output_channels
=
4
*
(
1
+
rand_r
(
&
seed
)
%
10
);
...
...
@@ -43,10 +43,8 @@ void RandomTest(const int num_outputs, const int axis) {
input_shape
=
{
batch
,
input_channels
,
height
,
width
};
else
if
(
axis
==
3
)
input_shape
=
{
batch
,
height
,
width
,
input_channels
};
const
index_t
input_size
=
std
::
accumulate
(
input_shape
.
begin
(),
input_shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
());
const
index_t
input_size
=
std
::
accumulate
(
input_shape
.
begin
(),
input_shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
());
std
::
vector
<
float
>
input_data
(
input_size
);
GenerateRandomRealTypeData
(
input_shape
,
&
input_data
);
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
input_shape
,
input_data
);
...
...
@@ -60,8 +58,7 @@ void RandomTest(const int num_outputs, const int axis) {
for
(
int
i
=
0
;
i
<
num_outputs
;
++
i
)
{
builder
=
builder
.
Output
(
MakeString
(
"OutputImage"
,
i
));
}
builder
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
builder
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
}
else
{
auto
builder
=
OpDefBuilder
(
"Slice"
,
"SliceTest"
).
AddIntArg
(
"axis"
,
axis
);
...
...
@@ -77,8 +74,7 @@ void RandomTest(const int num_outputs, const int axis) {
if
(
D
==
DeviceType
::
GPU
)
{
for
(
int
i
=
0
;
i
<
num_outputs
;
++
i
)
{
ImageToBuffer
<
D
,
float
>
(
&
net
,
MakeString
(
"OutputImage"
,
i
),
ImageToBuffer
<
D
,
float
>
(
&
net
,
MakeString
(
"OutputImage"
,
i
),
MakeString
(
"Output"
,
i
),
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
...
...
@@ -90,14 +86,12 @@ void RandomTest(const int num_outputs, const int axis) {
expected_shape
=
{
batch
,
output_channels
,
height
,
width
};
else
if
(
axis
==
3
)
expected_shape
=
{
batch
,
height
,
width
,
output_channels
};
const
index_t
outer_size
=
std
::
accumulate
(
expected_shape
.
begin
(),
expected_shape
.
begin
()
+
axis
,
1
,
std
::
multiplies
<
index_t
>
());
const
index_t
inner_size
=
std
::
accumulate
(
expected_shape
.
begin
()
+
axis
+
1
,
expected_shape
.
end
(),
1
,
const
index_t
outer_size
=
std
::
accumulate
(
expected_shape
.
begin
(),
expected_shape
.
begin
()
+
axis
,
1
,
std
::
multiplies
<
index_t
>
());
const
index_t
inner_size
=
std
::
accumulate
(
expected_shape
.
begin
()
+
axis
+
1
,
expected_shape
.
end
(),
1
,
std
::
multiplies
<
index_t
>
());
const
float
*
input_ptr
=
input_data
.
data
();
const
float
*
output_ptr
;
for
(
int
i
=
0
;
i
<
num_outputs
;
++
i
)
{
...
...
@@ -106,11 +100,11 @@ void RandomTest(const int num_outputs, const int axis) {
Tensor
::
MappingGuard
output_mapper
(
output
);
output_ptr
=
output
->
data
<
float
>
();
for
(
int
outer_idx
=
0
;
outer_idx
<
outer_size
;
++
outer_idx
)
{
const
int
idx
=
(
outer_idx
*
input_channels
+
i
*
output_channels
)
*
inner_size
;
const
int
idx
=
(
outer_idx
*
input_channels
+
i
*
output_channels
)
*
inner_size
;
for
(
int
j
=
0
;
j
<
output_channels
*
inner_size
;
++
j
)
{
ASSERT_NEAR
(
*
output_ptr
++
,
input_ptr
[
idx
+
j
],
1e-2
)
<<
"with output "
<<
i
<<
" index "
<<
idx
+
j
;
ASSERT_NEAR
(
*
output_ptr
++
,
input_ptr
[
idx
+
j
],
1e-2
)
<<
"with output "
<<
i
<<
" index "
<<
idx
+
j
;
}
}
}
...
...
mace/ops/softmax.h
浏览文件 @
33415ee9
...
...
@@ -31,7 +31,7 @@ class SoftmaxOp : public Operator<D, T> {
const
Tensor
*
logits
=
this
->
Input
(
LOGITS
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
output
->
ResizeLike
(
logits
);
MACE_RETURN_IF_ERROR
(
output
->
ResizeLike
(
logits
)
);
return
functor_
(
logits
,
output
,
future
);
}
...
...
mace/ops/softmax_test.cc
浏览文件 @
33415ee9
...
...
@@ -22,7 +22,7 @@ namespace test {
class
SoftmaxOpTest
:
public
OpsTestBase
{};
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Simple
()
{
// Construct graph
OpsTestNet
net
;
...
...
@@ -71,7 +71,7 @@ TEST_F(SoftmaxOpTest, CPUSimple) { Simple<DeviceType::CPU>(); }
TEST_F
(
SoftmaxOpTest
,
OPENCLSimple
)
{
Simple
<
DeviceType
::
GPU
>
();
}
namespace
{
template
<
DeviceType
D
>
template
<
DeviceType
D
>
void
Complex
(
const
std
::
vector
<
index_t
>
&
logits_shape
)
{
// Construct graph
OpsTestNet
net
;
...
...
@@ -108,8 +108,7 @@ void Complex(const std::vector<index_t> &logits_shape) {
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
);
}
}
// namespace
...
...
mace/ops/space_to_batch.h
浏览文件 @
33415ee9
...
...
@@ -29,16 +29,14 @@ class SpaceToBatchNDOp : public Operator<D, T> {
public:
SpaceToBatchNDOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"paddings"
,
{
0
,
0
,
0
,
0
}),
functor_
(
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"paddings"
,
{
0
,
0
,
0
,
0
}),
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"block_shape"
,
{
1
,
1
}),
false
)
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
space_tensor
=
this
->
Input
(
INPUT
);
Tensor
*
batch_tensor
=
this
->
Output
(
OUTPUT
);
return
functor_
(
const_cast
<
Tensor
*>
(
space_tensor
),
batch_tensor
,
future
);
return
functor_
(
const_cast
<
Tensor
*>
(
space_tensor
),
batch_tensor
,
future
);
}
private:
...
...
mace/ops/space_to_batch_test.cc
浏览文件 @
33415ee9
...
...
@@ -41,9 +41,7 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
}
else
if
(
D
==
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -60,10 +58,8 @@ void RunSpaceToBatch(const std::vector<index_t> &input_shape,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
else
if
(
D
==
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
// Check
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -89,9 +85,7 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
}
else
if
(
D
==
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -108,10 +102,8 @@ void RunBatchToSpace(const std::vector<index_t> &input_shape,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
else
if
(
D
==
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
// Check
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -124,8 +116,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
const
std
::
vector
<
int
>
&
padding_data
,
const
std
::
vector
<
index_t
>
&
batch_shape
,
const
std
::
vector
<
float
>
&
batch_data
)
{
auto
space_tensor
=
std
::
unique_ptr
<
Tensor
>
(
new
Tensor
(
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
T
>::
v
()));
auto
space_tensor
=
std
::
unique_ptr
<
Tensor
>
(
new
Tensor
(
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
T
>::
v
()));
space_tensor
->
Resize
(
space_shape
);
{
Tensor
::
MappingGuard
space_mapper
(
space_tensor
.
get
());
...
...
@@ -136,8 +128,8 @@ void TestBidirectionalTransform(const std::vector<index_t> &space_shape,
memcpy
(
space_ptr
,
space_data
.
data
(),
space_data
.
size
()
*
sizeof
(
T
));
}
auto
batch_tensor
=
std
::
unique_ptr
<
Tensor
>
(
new
Tensor
(
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
T
>::
v
()));
auto
batch_tensor
=
std
::
unique_ptr
<
Tensor
>
(
new
Tensor
(
GetDeviceAllocator
(
DeviceType
::
GPU
),
DataTypeToEnum
<
T
>::
v
()));
batch_tensor
->
Resize
(
batch_shape
);
{
Tensor
::
MappingGuard
batch_mapper
(
batch_tensor
.
get
());
...
...
@@ -233,9 +225,7 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
// run cpu
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -244,10 +234,8 @@ void TestSpaceToBatchLargeInput(const std::vector<index_t> &input_shape,
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"OutputCPU"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"OutputCPU"
,
NHWC
);
// Check
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"OutputCPU"
),
...
...
@@ -274,9 +262,7 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
// run cpu
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
.
Input
(
"InputNCHW"
)
...
...
@@ -285,17 +271,14 @@ void TestoBatchToSpaceLargeInput(const std::vector<index_t> &input_shape,
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"OutputCPU"
,
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"OutputCPU"
,
NHWC
);
// Check
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"OutputCPU"
),
*
net
.
GetOutput
(
"OutputGPU"
));
}
TEST
(
SpaceToBatchTest
,
LargeData
)
{
TestSpaceToBatchLargeInput
({
1
,
256
,
256
,
32
},
{
8
,
8
},
{
0
,
0
,
0
,
0
});
TestSpaceToBatchLargeInput
({
1
,
256
,
256
,
32
},
{
8
,
8
},
{
4
,
4
,
4
,
4
});
...
...
mace/ops/space_to_depth.h
浏览文件 @
33415ee9
...
...
@@ -24,20 +24,18 @@
namespace
mace
{
namespace
ops
{
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
class
SpaceToDepthOp
:
public
Operator
<
D
,
T
>
{
public:
SpaceToDepthOp
(
const
OperatorDef
&
op_def
,
Workspace
*
ws
)
:
Operator
<
D
,
T
>
(
op_def
,
ws
),
functor_
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
),
false
)
{
}
functor_
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
),
false
)
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_CHECK
(
input
->
dim_size
()
==
4
,
"input dim should be 4"
);
const
int
block_size
=
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
);
const
int
block_size
=
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
);
index_t
input_height
;
index_t
input_width
;
index_t
input_depth
;
...
...
mace/ops/transpose.cc
浏览文件 @
33415ee9
mace/ops/transpose.h
浏览文件 @
33415ee9
...
...
@@ -18,12 +18,12 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/transpose.h"
#include "mace/kernels/softmax.h"
#include "mace/kernels/transpose.h"
namespace
mace
{
template
<
DeviceType
D
,
class
T
>
template
<
DeviceType
D
,
class
T
>
class
TransposeOp
:
public
Operator
<
D
,
T
>
{
public:
TransposeOp
(
const
OperatorDef
&
operator_def
,
Workspace
*
ws
)
...
...
@@ -35,14 +35,14 @@ class TransposeOp : public Operator<D, T> {
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
const
std
::
vector
<
index_t
>
&
input_shape
=
input
->
shape
();
MACE_CHECK
((
input_shape
.
size
()
==
4
&&
dims_
.
size
()
==
4
)
||
(
input_shape
.
size
()
==
2
&&
dims_
.
size
()
==
2
),
MACE_CHECK
((
input_shape
.
size
()
==
4
&&
dims_
.
size
()
==
4
)
||
(
input_shape
.
size
()
==
2
&&
dims_
.
size
()
==
2
),
"rank should be 2 or 4"
);
std
::
vector
<
index_t
>
output_shape
;
for
(
size_t
i
=
0
;
i
<
dims_
.
size
();
++
i
)
{
output_shape
.
push_back
(
input_shape
[
dims_
[
i
]]);
}
MACE_
FAILURE_RETURN
(
output
->
Resize
(
output_shape
));
MACE_
RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
return
functor_
(
input
,
output
,
future
);
}
...
...
mace/ops/transpose_test.cc
浏览文件 @
33415ee9
...
...
@@ -37,10 +37,8 @@ void TransposeNCHWTest(const std::vector<index_t> &input_shape) {
// Run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
DataFormat
::
NHWC
,
"InputNCHW"
,
DataFormat
::
NCHW
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
DataFormat
::
NHWC
,
"InputNCHW"
,
DataFormat
::
NCHW
);
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"InputNCHW"
),
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -61,10 +59,8 @@ void TransposeNHWCTest(const std::vector<index_t> &input_shape) {
// Run on cpu
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
DataFormat
::
NCHW
,
"InputNHWC"
,
DataFormat
::
NHWC
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
DataFormat
::
NCHW
,
"InputNHWC"
,
DataFormat
::
NHWC
);
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"InputNHWC"
),
*
net
.
GetOutput
(
"Output"
));
...
...
@@ -99,8 +95,7 @@ TEST_F(TransposeOpTest, Rank2) {
// Run on cpu
net
.
RunOp
();
net
.
AddInputFromArray
<
CPU
,
float
>
(
"ExpectedOutput"
,
{
3
,
2
},
net
.
AddInputFromArray
<
CPU
,
float
>
(
"ExpectedOutput"
,
{
3
,
2
},
{
1
,
4
,
2
,
5
,
3
,
6
});
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"ExpectedOutput"
),
...
...
mace/ops/winograd_convolution_test.cc
浏览文件 @
33415ee9
...
...
@@ -132,11 +132,9 @@ void WinogradConvolution(const index_t batch,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"WinoOutputImage"
,
"WinoOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DataType
::
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-2
,
1e-2
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-2
,
1e-2
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-5
,
1e-4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-5
,
1e-4
);
}
}
}
// namespace
...
...
@@ -144,22 +142,19 @@ void WinogradConvolution(const index_t batch,
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolution
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
SAME
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
Padding
::
SAME
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolution
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolution
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
Padding
::
VALID
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
5
,
61
,
67
,
37
,
31
,
Padding
::
SAME
);
}
namespace
{
...
...
@@ -248,34 +243,26 @@ void WinogradConvolutionWithPad(const index_t batch,
ImageToBuffer
<
D
,
float
>
(
&
net
,
"WinoOutputImage"
,
"WinoOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DataType
::
DT_HALF
)
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-2
,
1e-2
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-2
,
1e-2
);
}
else
{
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-5
,
1e-4
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"WinoOutput"
),
1e-5
,
1e-4
);
}
}
}
// namespace
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionWithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
32
,
32
,
32
,
16
,
2
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
32
,
32
,
32
,
16
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
UnAlignedConvolutionWithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
61
,
67
,
37
,
31
,
2
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
61
,
67
,
31
,
37
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
1
,
61
,
67
,
37
,
31
,
2
);
}
TEST_F
(
WinogradConvlutionTest
,
BatchConvolutionWithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
5
,
61
,
67
,
37
,
31
,
2
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
3
,
64
,
64
,
32
,
32
,
1
);
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
half
>
(
5
,
61
,
67
,
37
,
31
,
2
);
}
}
// namespace test
...
...
mace/public/mace.h
浏览文件 @
33415ee9
...
...
@@ -65,7 +65,7 @@ enum MaceStatus {
MACE_OUT_OF_RESOURCES
=
2
};
#define MACE_
FAILURE_RETURN
(stmt) \
#define MACE_
RETURN_IF_ERROR
(stmt) \
{ \
MaceStatus status = (stmt); \
if (status != MACE_SUCCESS) { \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录