Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
a74060eb
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a74060eb
编写于
5月 21, 2018
作者:
李
李寅
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Improve batchspace performance
上级
dbbf8596
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
267 addition
and
66 deletion
+267
-66
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+126
-45
mace/ops/batch_to_space_benchmark.cc
mace/ops/batch_to_space_benchmark.cc
+25
-10
mace/ops/space_to_batch_benchmark.cc
mace/ops/space_to_batch_benchmark.cc
+26
-11
mace/ops/space_to_batch_test.cc
mace/ops/space_to_batch_test.cc
+90
-0
未找到文件。
mace/kernels/space_to_batch.h
浏览文件 @
a74060eb
...
@@ -17,6 +17,7 @@
...
@@ -17,6 +17,7 @@
#include <memory>
#include <memory>
#include <vector>
#include <vector>
#include <algorithm>
#include "mace/core/future.h"
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/tensor.h"
...
@@ -160,10 +161,16 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
...
@@ -160,10 +161,16 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
Tensor
::
MappingGuard
input_guard
(
space_tensor
);
Tensor
::
MappingGuard
input_guard
(
space_tensor
);
Tensor
::
MappingGuard
output_guard
(
batch_tensor
);
Tensor
::
MappingGuard
output_guard
(
batch_tensor
);
int
pad_top
=
paddings_
[
0
];
int
pad_left
=
paddings_
[
2
];
int
block_shape_h
=
block_shape_
[
0
];
int
block_shape_w
=
block_shape_
[
1
];
if
(
b2s_
)
{
if
(
b2s_
)
{
const
float
*
input_data
=
batch_tensor
->
data
<
float
>
();
const
float
*
input_data
=
batch_tensor
->
data
<
float
>
();
float
*
output_data
=
space_tensor
->
mutable_data
<
float
>
();
float
*
output_data
=
space_tensor
->
mutable_data
<
float
>
();
index_t
in_batches
=
batch_tensor
->
dim
(
0
);
index_t
in_height
=
batch_tensor
->
dim
(
2
);
index_t
in_height
=
batch_tensor
->
dim
(
2
);
index_t
in_width
=
batch_tensor
->
dim
(
3
);
index_t
in_width
=
batch_tensor
->
dim
(
3
);
...
@@ -172,26 +179,58 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
...
@@ -172,26 +179,58 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t
out_height
=
space_tensor
->
dim
(
2
);
index_t
out_height
=
space_tensor
->
dim
(
2
);
index_t
out_width
=
space_tensor
->
dim
(
3
);
index_t
out_width
=
space_tensor
->
dim
(
3
);
#pragma omp parallel for collapse(2)
// 32k/sizeof(float)/out_width/block_shape
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
index_t
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
block_h_size
=
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
out_width
);
const
index_t
in_h
=
(
h
+
paddings_
[
0
])
/
block_shape_
[
0
];
const
index_t
tile_h
=
(
h
+
paddings_
[
0
])
%
block_shape_
[
0
];
// make channel outter loop so we can make best use of cache
for
(
index_t
w
=
0
;
w
<
out_width
;
++
w
)
{
#pragma omp parallel for collapse(3)
const
index_t
in_w
=
(
w
+
paddings_
[
2
])
/
block_shape_
[
1
];
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
tile_w
=
(
w
+
paddings_
[
2
])
%
block_shape_
[
1
];
for
(
index_t
block_h
=
0
;
block_h
<
in_height
;
const
index_t
block_h
+=
block_h_size
)
{
in_b
=
(
tile_h
*
block_shape_
[
1
]
+
tile_w
)
*
out_batches
+
b
;
for
(
index_t
in_b
=
0
;
in_b
<
in_batches
;
++
in_b
)
{
output_data
[((
b
*
channels
+
c
)
*
out_height
+
h
)
*
out_width
const
index_t
b
=
in_b
%
out_batches
;
+
w
]
=
const
index_t
tile_index
=
in_b
/
out_batches
;
input_data
[
const
index_t
tile_h
=
tile_index
/
block_shape_w
;
((
in_b
*
channels
+
c
)
*
in_height
+
in_h
)
*
in_width
const
index_t
tile_w
=
tile_index
%
block_shape_w
;
+
in_w
];
const
index_t
valid_h_start
=
std
::
max
(
block_h
,
}
(
pad_top
-
tile_h
}
+
block_shape_h
-
1
)
}
/
block_shape_h
);
}
const
index_t
valid_h_end
=
std
::
min
(
in_height
,
std
::
min
(
block_h
+
block_h_size
,
(
out_height
+
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
));
const
index_t
valid_w_start
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
(
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
index_t
valid_w_end
=
std
::
min
(
in_width
,
(
out_width
+
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
float
*
input_base
=
input_data
+
(
in_b
*
channels
+
c
)
*
in_height
*
in_width
;
float
*
output_base
=
output_data
+
(
b
*
channels
+
c
)
*
out_height
*
out_width
;
index_t
h
=
valid_h_start
*
block_shape_h
+
tile_h
-
pad_top
;
for
(
index_t
in_h
=
valid_h_start
;
in_h
<
valid_h_end
;
++
in_h
)
{
index_t
w
=
valid_w_start
*
block_shape_w
+
tile_w
-
pad_left
;
for
(
index_t
in_w
=
valid_w_start
;
in_w
<
valid_w_end
;
++
in_w
)
{
output_base
[
h
*
out_width
+
w
]
=
input_base
[
in_h
*
in_width
+
in_w
];
w
+=
block_shape_w
;
}
// w
h
+=
block_shape_h
;
}
// h
}
// b
}
// block_h
}
// c
}
else
{
}
else
{
const
float
*
input_data
=
space_tensor
->
data
<
float
>
();
const
float
*
input_data
=
space_tensor
->
data
<
float
>
();
float
*
output_data
=
batch_tensor
->
mutable_data
<
float
>
();
float
*
output_data
=
batch_tensor
->
mutable_data
<
float
>
();
...
@@ -205,31 +244,73 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
...
@@ -205,31 +244,73 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
index_t
out_height
=
batch_tensor
->
dim
(
2
);
index_t
out_height
=
batch_tensor
->
dim
(
2
);
index_t
out_width
=
batch_tensor
->
dim
(
3
);
index_t
out_width
=
batch_tensor
->
dim
(
3
);
#pragma omp parallel for collapse(2)
index_t
block_h_size
=
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
in_width
);
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
const
index_t
in_b
=
b
%
in_batches
;
// make channel outter loop so we can make best use of cache
const
index_t
tile_h
=
b
/
in_batches
/
block_shape_
[
1
];
#pragma omp parallel for collapse(3)
const
index_t
tile_w
=
b
/
in_batches
%
block_shape_
[
1
];
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
h
=
0
;
h
<
out_height
;
++
h
)
{
for
(
index_t
block_h
=
0
;
block_h
<
out_height
;
const
index_t
in_h
=
h
*
block_shape_
[
0
]
+
tile_h
-
paddings_
[
0
];
block_h
+=
block_h_size
)
{
for
(
index_t
w
=
0
;
w
<
out_width
;
++
w
)
{
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
const
index_t
in_w
=
w
*
block_shape_
[
1
]
+
tile_w
-
paddings_
[
2
];
const
index_t
in_b
=
b
%
in_batches
;
if
(
in_h
>=
0
&&
in_w
>=
0
&&
in_h
<
in_height
const
index_t
tile_index
=
b
/
in_batches
;
&&
in_w
<
in_width
)
{
const
index_t
tile_h
=
tile_index
/
block_shape_w
;
output_data
[((
b
*
channels
+
c
)
*
out_height
+
h
)
*
out_width
const
index_t
tile_w
=
tile_index
%
block_shape_w
;
+
w
]
=
const
index_t
valid_h_start
=
std
::
max
(
block_h
,
input_data
[
(
pad_top
-
tile_h
((
in_b
*
channels
+
c
)
*
in_height
+
in_h
)
*
in_width
+
block_shape_h
-
1
)
+
in_w
];
/
block_shape_h
);
}
else
{
const
index_t
valid_h_end
=
std
::
min
(
out_height
,
output_data
[((
b
*
channels
+
c
)
*
out_height
+
h
)
*
out_width
std
::
min
(
+
w
]
=
0
;
block_h
+
block_h_size
,
}
(
in_height
+
pad_top
}
-
tile_h
}
+
block_shape_h
-
1
)
}
/
block_shape_h
));
}
const
index_t
valid_w_start
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
(
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
index_t
valid_w_end
=
std
::
min
(
out_width
,
(
in_width
+
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
float
*
input_base
=
input_data
+
(
in_b
*
channels
+
c
)
*
in_height
*
in_width
;
float
*
output_base
=
output_data
+
(
b
*
channels
+
c
)
*
out_height
*
out_width
;
memset
(
output_base
+
block_h
*
out_width
,
0
,
(
valid_h_start
-
block_h
)
*
out_width
*
sizeof
(
float
));
index_t
in_h
=
valid_h_start
*
block_shape_h
+
tile_h
-
pad_top
;
for
(
index_t
h
=
valid_h_start
;
h
<
valid_h_end
;
++
h
)
{
memset
(
output_base
+
h
*
out_width
,
0
,
valid_w_start
*
sizeof
(
float
));
index_t
in_w
=
valid_w_start
*
block_shape_w
+
tile_w
-
pad_left
;
for
(
index_t
w
=
valid_w_start
;
w
<
valid_w_end
;
++
w
)
{
output_base
[
h
*
out_width
+
w
]
=
input_base
[
in_h
*
in_width
+
in_w
];
in_w
+=
block_shape_w
;
}
// w
in_h
+=
block_shape_h
;
memset
(
output_base
+
h
*
out_width
+
valid_w_end
,
0
,
(
out_width
-
valid_w_end
)
*
sizeof
(
float
));
}
// h
memset
(
output_base
+
valid_h_end
*
out_width
,
0
,
(
std
::
min
(
out_height
,
block_h
+
block_h_size
)
-
valid_h_end
)
*
out_width
*
sizeof
(
float
));
}
// b
}
// block_h
}
// c
}
}
}
}
};
};
...
...
mace/ops/batch_to_space_benchmark.cc
浏览文件 @
a74060eb
...
@@ -27,16 +27,29 @@ void BMBatchToSpace(
...
@@ -27,16 +27,29 @@ void BMBatchToSpace(
mace
::
testing
::
StopTiming
();
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
OpsTestNet
net
;
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
if
(
D
==
DeviceType
::
CPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
channels
,
height
,
width
});
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
else
if
(
D
==
DeviceType
::
GPU
)
{
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
.
Input
(
"InputImage"
)
}
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"crops"
,
{
0
,
0
,
0
,
0
})
.
AddIntsArg
(
"block_shape"
,
{
arg
,
arg
})
.
Finalize
(
net
.
NewOperatorDef
());
if
(
D
==
DeviceType
::
CPU
)
{
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"crops"
,
{
0
,
0
,
0
,
0
})
.
AddIntsArg
(
"block_shape"
,
{
arg
,
arg
})
.
Finalize
(
net
.
NewOperatorDef
());
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"crops"
,
{
0
,
0
,
0
,
0
})
.
AddIntsArg
(
"block_shape"
,
{
arg
,
arg
})
.
Finalize
(
net
.
NewOperatorDef
());
}
// Warm-up
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
...
@@ -63,11 +76,13 @@ void BMBatchToSpace(
...
@@ -63,11 +76,13 @@ void BMBatchToSpace(
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
BENCHMARK(BM_BATCH_TO_SPACE_##N##_##H##_##W##_##C##_##ARG##_##TYPE##_##DEVICE)
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
#define BM_BATCH_TO_SPACE(N, H, W, C, ARG) \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU);
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, GPU); \
BM_BATCH_TO_SPACE_MACRO(N, H, W, C, ARG, float, CPU);
BM_BATCH_TO_SPACE
(
128
,
8
,
8
,
128
,
2
);
BM_BATCH_TO_SPACE
(
128
,
8
,
8
,
128
,
2
);
BM_BATCH_TO_SPACE
(
4
,
128
,
128
,
32
,
2
);
BM_BATCH_TO_SPACE
(
4
,
128
,
128
,
32
,
2
);
BM_BATCH_TO_SPACE
(
16
,
64
,
64
,
32
,
4
);
BM_BATCH_TO_SPACE
(
16
,
64
,
64
,
32
,
4
);
BM_BATCH_TO_SPACE
(
64
,
32
,
32
,
32
,
8
);
}
// namespace test
}
// namespace test
}
// namespace ops
}
// namespace ops
...
...
mace/ops/space_to_batch_benchmark.cc
浏览文件 @
a74060eb
...
@@ -27,17 +27,29 @@ void BMSpaceToBatch(
...
@@ -27,17 +27,29 @@ void BMSpaceToBatch(
mace
::
testing
::
StopTiming
();
mace
::
testing
::
StopTiming
();
OpsTestNet
net
;
OpsTestNet
net
;
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
if
(
D
==
DeviceType
::
CPU
)
{
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
channels
,
height
,
width
});
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
}
else
if
(
D
==
DeviceType
::
GPU
)
{
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
height
,
width
,
channels
});
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
}
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"paddings"
,
{
shape
,
shape
,
shape
,
shape
})
.
AddIntsArg
(
"block_shape"
,
{
shape
,
shape
})
.
Finalize
(
net
.
NewOperatorDef
());
if
(
D
==
DeviceType
::
CPU
)
{
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
.
Input
(
"Input"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"paddings"
,
{
shape
,
shape
,
shape
,
shape
})
.
AddIntsArg
(
"block_shape"
,
{
shape
,
shape
})
.
Finalize
(
net
.
NewOperatorDef
());
}
else
if
(
D
==
DeviceType
::
GPU
)
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"paddings"
,
{
shape
,
shape
,
shape
,
shape
})
.
AddIntsArg
(
"block_shape"
,
{
shape
,
shape
})
.
Finalize
(
net
.
NewOperatorDef
());
}
// Warm-up
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
...
@@ -65,11 +77,14 @@ void BMSpaceToBatch(
...
@@ -65,11 +77,14 @@ void BMSpaceToBatch(
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
BM_SPACE_TO_BATCH_##N##_##H##_##W##_##C##_##SHAPE##_##TYPE##_##DEVICE)
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
#define BM_SPACE_TO_BATCH(N, H, W, C, SHAPE) \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU);
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, GPU); \
BM_SPACE_TO_BATCH_MACRO(N, H, W, C, SHAPE, float, CPU);
BM_SPACE_TO_BATCH
(
128
,
16
,
16
,
128
,
2
);
BM_SPACE_TO_BATCH
(
128
,
16
,
16
,
128
,
2
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
2
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
2
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
16
,
2
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
4
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
4
);
BM_SPACE_TO_BATCH
(
1
,
256
,
256
,
32
,
8
);
}
// namespace test
}
// namespace test
}
// namespace ops
}
// namespace ops
...
...
mace/ops/space_to_batch_test.cc
浏览文件 @
a74060eb
...
@@ -213,6 +213,96 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
...
@@ -213,6 +213,96 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
9
,
10
,
13
,
14
,
25
,
26
,
29
,
30
,
11
,
12
,
15
,
16
,
27
,
28
,
31
,
32
});
9
,
10
,
13
,
14
,
25
,
26
,
29
,
30
,
11
,
12
,
15
,
16
,
27
,
28
,
31
,
32
});
}
}
void
TestSpaceToBatchLargeInput
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
int
>
&
block_shape_data
,
const
std
::
vector
<
int
>
&
padding_data
)
{
OpsTestNet
net
;
net
.
AddRandomInput
<
GPU
,
float
>
(
"Input"
,
input_shape
);
// run gpu
BufferToImage
<
GPU
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"paddings"
,
padding_data
)
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
GPU
);
ImageToBuffer
<
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"OutputGPU"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
// run cpu
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"SpaceToBatchND"
,
"SpaceToBatchNDTest"
)
.
Input
(
"InputNCHW"
)
.
Output
(
"OutputNCHW"
)
.
AddIntsArg
(
"paddings"
,
padding_data
)
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"OutputCPU"
,
NHWC
);
// Check
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"OutputCPU"
),
*
net
.
GetOutput
(
"OutputGPU"
));
}
void
TestoBatchToSpaceLargeInput
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
int
>
&
block_shape_data
,
const
std
::
vector
<
int
>
&
crops_data
)
{
OpsTestNet
net
;
net
.
AddRandomInput
<
GPU
,
float
>
(
"Input"
,
input_shape
);
// run gpu
BufferToImage
<
GPU
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntsArg
(
"crops"
,
crops_data
)
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
GPU
);
ImageToBuffer
<
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"OutputGPU"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
// run cpu
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"BatchToSpaceND"
,
"BatchToSpaceNDTest"
)
.
Input
(
"InputNCHW"
)
.
Output
(
"OutputNCHW"
)
.
AddIntsArg
(
"crops"
,
crops_data
)
.
AddIntsArg
(
"block_shape"
,
block_shape_data
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
CPU
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"OutputCPU"
,
NHWC
);
// Check
ExpectTensorNear
<
float
>
(
*
net
.
GetOutput
(
"OutputCPU"
),
*
net
.
GetOutput
(
"OutputGPU"
));
}
TEST
(
SpaceToBatchTest
,
LargeData
)
{
TestSpaceToBatchLargeInput
({
1
,
256
,
256
,
32
},
{
8
,
8
},
{
0
,
0
,
0
,
0
});
TestSpaceToBatchLargeInput
({
1
,
256
,
256
,
32
},
{
8
,
8
},
{
4
,
4
,
4
,
4
});
TestoBatchToSpaceLargeInput
({
64
,
32
,
32
,
32
},
{
8
,
8
},
{
0
,
0
,
0
,
0
});
TestoBatchToSpaceLargeInput
({
64
,
32
,
32
,
32
},
{
8
,
8
},
{
4
,
4
,
4
,
4
});
}
}
// namespace test
}
// namespace test
}
// namespace ops
}
// namespace ops
}
// namespace mace
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录