Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
项目经理老王
Mace
提交
ea9a8243
Mace
项目概览
项目经理老王
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ea9a8243
编写于
9月 12, 2018
作者:
L
liutuo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix conflicts
上级
b1764247
变更
18
隐藏空白更改
内联
并排
Showing
18 changed file
with
958 addition
and
529 deletion
+958
-529
mace/kernels/batch_to_space.h
mace/kernels/batch_to_space.h
+209
-0
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+30
-64
mace/kernels/opencl/batch_to_space.cc
mace/kernels/opencl/batch_to_space.cc
+104
-0
mace/kernels/opencl/cl/batch_to_space.cl
mace/kernels/opencl/cl/batch_to_space.cl
+48
-0
mace/kernels/opencl/cl/depth_to_space.cl
mace/kernels/opencl/cl/depth_to_space.cl
+0
-40
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+0
-47
mace/kernels/opencl/cl/space_to_depth.cl
mace/kernels/opencl/cl/space_to_depth.cl
+41
-0
mace/kernels/opencl/depth_to_space.cc
mace/kernels/opencl/depth_to_space.cc
+29
-49
mace/kernels/opencl/space_to_batch.cc
mace/kernels/opencl/space_to_batch.cc
+8
-25
mace/kernels/opencl/space_to_depth.cc
mace/kernels/opencl/space_to_depth.cc
+107
-0
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+80
-195
mace/kernels/space_to_depth.h
mace/kernels/space_to_depth.h
+114
-0
mace/ops/batch_to_space.h
mace/ops/batch_to_space.h
+3
-4
mace/ops/depth_to_space.h
mace/ops/depth_to_space.h
+1
-14
mace/ops/depth_to_space_test.cc
mace/ops/depth_to_space_test.cc
+13
-64
mace/ops/space_to_batch.h
mace/ops/space_to_batch.h
+1
-2
mace/ops/space_to_depth.h
mace/ops/space_to_depth.h
+3
-25
mace/ops/space_to_depth_test.cc
mace/ops/space_to_depth_test.cc
+167
-0
未找到文件。
mace/kernels/batch_to_space.h
0 → 100644
浏览文件 @
ea9a8243
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_
#define MACE_KERNELS_BATCH_TO_SPACE_H_
#include <memory>
#include <vector>
#include <algorithm>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/kernel.h"
#include "mace/public/mace.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace
mace
{
namespace
kernels
{
struct
BatchToSpaceFunctorBase
:
OpKernel
{
BatchToSpaceFunctorBase
(
OpKernelContext
*
context
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
)
:
OpKernel
(
context
),
paddings_
(
paddings
.
begin
(),
paddings
.
end
()),
block_shape_
(
block_shape
.
begin
(),
block_shape
.
end
())
{
MACE_CHECK
(
block_shape
.
size
()
==
2
&&
block_shape
[
0
]
>
1
&&
block_shape
[
1
]
>
1
,
"Block's shape should be 1D, and greater than 1"
);
MACE_CHECK
(
paddings
.
size
()
==
4
,
"Paddings' shape should be 2D"
);
}
std
::
vector
<
int
>
paddings_
;
std
::
vector
<
int
>
block_shape_
;
protected:
void
CalculateBatchToSpaceOutputShape
(
const
Tensor
*
input_tensor
,
const
DataFormat
data_format
,
index_t
*
output_shape
)
{
MACE_CHECK
(
input_tensor
->
dim_size
()
==
4
,
"Input's shape should be 4D"
);
index_t
batch
=
input_tensor
->
dim
(
0
);
index_t
channels
=
0
;
index_t
height
=
0
;
index_t
width
=
0
;
if
(
data_format
==
DataFormat
::
NHWC
)
{
height
=
input_tensor
->
dim
(
1
);
width
=
input_tensor
->
dim
(
2
);
channels
=
input_tensor
->
dim
(
3
);
}
else
if
(
data_format
==
DataFormat
::
NCHW
)
{
height
=
input_tensor
->
dim
(
2
);
width
=
input_tensor
->
dim
(
3
);
channels
=
input_tensor
->
dim
(
1
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
index_t
new_batch
=
batch
/
block_shape_
[
0
]
/
block_shape_
[
1
];
index_t
new_height
=
height
*
block_shape_
[
0
]
-
paddings_
[
0
]
-
paddings_
[
1
];
index_t
new_width
=
width
*
block_shape_
[
1
]
-
paddings_
[
2
]
-
paddings_
[
3
];
if
(
data_format
==
DataFormat
::
NHWC
)
{
output_shape
[
0
]
=
new_batch
;
output_shape
[
1
]
=
new_height
;
output_shape
[
2
]
=
new_width
;
output_shape
[
3
]
=
channels
;
}
else
{
output_shape
[
0
]
=
new_batch
;
output_shape
[
1
]
=
channels
;
output_shape
[
2
]
=
new_height
;
output_shape
[
3
]
=
new_width
;
}
}
};
template
<
DeviceType
D
,
typename
T
>
struct
BatchToSpaceFunctor
;
template
<
>
struct
BatchToSpaceFunctor
<
DeviceType
::
CPU
,
float
>
:
BatchToSpaceFunctorBase
{
BatchToSpaceFunctor
(
OpKernelContext
*
context
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
)
:
BatchToSpaceFunctorBase
(
context
,
paddings
,
block_shape
)
{}
MaceStatus
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NCHW
,
output_shape
.
data
());
MACE_RETURN_IF_ERROR
(
space_tensor
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
input_guard
(
space_tensor
);
Tensor
::
MappingGuard
output_guard
(
batch_tensor
);
int
pad_top
=
paddings_
[
0
];
int
pad_left
=
paddings_
[
2
];
int
block_shape_h
=
block_shape_
[
0
];
int
block_shape_w
=
block_shape_
[
1
];
const
float
*
input_data
=
batch_tensor
->
data
<
float
>
();
float
*
output_data
=
space_tensor
->
mutable_data
<
float
>
();
index_t
in_batches
=
batch_tensor
->
dim
(
0
);
index_t
in_height
=
batch_tensor
->
dim
(
2
);
index_t
in_width
=
batch_tensor
->
dim
(
3
);
index_t
out_batches
=
space_tensor
->
dim
(
0
);
index_t
channels
=
space_tensor
->
dim
(
1
);
index_t
out_height
=
space_tensor
->
dim
(
2
);
index_t
out_width
=
space_tensor
->
dim
(
3
);
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size
=
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
out_width
);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
in_height
;
block_h
+=
block_h_size
)
{
for
(
index_t
in_b
=
0
;
in_b
<
in_batches
;
++
in_b
)
{
const
index_t
b
=
in_b
%
out_batches
;
const
index_t
tile_index
=
in_b
/
out_batches
;
const
index_t
tile_h
=
tile_index
/
block_shape_w
;
const
index_t
tile_w
=
tile_index
%
block_shape_w
;
const
index_t
valid_h_start
=
std
::
max
(
block_h
,
(
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
);
const
index_t
valid_h_end
=
std
::
min
(
in_height
,
std
::
min
(
block_h
+
block_h_size
,
(
out_height
+
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
));
const
index_t
valid_w_start
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
(
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
index_t
valid_w_end
=
std
::
min
(
in_width
,
(
out_width
+
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
float
*
input_base
=
input_data
+
(
in_b
*
channels
+
c
)
*
in_height
*
in_width
;
float
*
output_base
=
output_data
+
(
b
*
channels
+
c
)
*
out_height
*
out_width
;
index_t
h
=
valid_h_start
*
block_shape_h
+
tile_h
-
pad_top
;
for
(
index_t
in_h
=
valid_h_start
;
in_h
<
valid_h_end
;
++
in_h
)
{
index_t
w
=
valid_w_start
*
block_shape_w
+
tile_w
-
pad_left
;
for
(
index_t
in_w
=
valid_w_start
;
in_w
<
valid_w_end
;
++
in_w
)
{
output_base
[
h
*
out_width
+
w
]
=
input_base
[
in_h
*
in_width
+
in_w
];
w
+=
block_shape_w
;
}
// w
h
+=
block_shape_h
;
}
// h
}
// b
}
// block_h
}
// c
return
MACE_SUCCESS
;
}
};
#ifdef MACE_ENABLE_OPENCL
template
<
typename
T
>
struct
BatchToSpaceFunctor
<
DeviceType
::
GPU
,
T
>
:
BatchToSpaceFunctorBase
{
BatchToSpaceFunctor
(
OpKernelContext
*
context
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
)
:
BatchToSpaceFunctorBase
(
context
,
paddings
,
block_shape
)
{}
MaceStatus
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
space_shape_
;
};
#endif // MACE_ENABLE_OPENCL
}
// namespace kernels
}
// namespace mace
#endif // MACE_KERNELS_BATCH_TO_SPACE_H_
mace/kernels/depth_to_space.h
浏览文件 @
ea9a8243
...
...
@@ -32,9 +32,8 @@ namespace kernels {
template
<
DeviceType
D
,
typename
T
>
struct
DepthToSpaceOpFunctor
:
OpKernel
{
DepthToSpaceOpFunctor
(
OpKernelContext
*
context
,
const
int
block_size
,
bool
d2s
)
:
OpKernel
(
context
),
block_size_
(
block_size
),
d2s_
(
d2s
)
{}
const
int
block_size
)
:
OpKernel
(
context
),
block_size_
(
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
...
@@ -44,17 +43,13 @@ struct DepthToSpaceOpFunctor : OpKernel {
const
index_t
input_height
=
input
->
dim
(
2
);
const
index_t
input_width
=
input
->
dim
(
3
);
index_t
output_depth
,
output_width
,
output_height
;
MACE_CHECK
(
input_depth
%
(
block_size_
*
block_size_
)
==
0
,
"input depth should be dividable by block_size * block_size"
,
input_depth
);
if
(
d2s_
)
{
output_depth
=
input_depth
/
(
block_size_
*
block_size_
);
output_width
=
input_width
*
block_size_
;
output_height
=
input_height
*
block_size_
;
}
else
{
output_depth
=
input_depth
*
block_size_
*
block_size_
;
output_width
=
input_width
/
block_size_
;
output_height
=
input_height
/
block_size_
;
}
const
index_t
output_depth
=
input_depth
/
(
block_size_
*
block_size_
);
const
index_t
output_width
=
input_width
*
block_size_
;
const
index_t
output_height
=
input_height
*
block_size_
;
std
::
vector
<
index_t
>
output_shape
=
{
batch_size
,
output_depth
,
output_height
,
output_width
};
...
...
@@ -65,78 +60,49 @@ struct DepthToSpaceOpFunctor : OpKernel {
const
T
*
input_ptr
=
input
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
if
(
d2s_
)
{
#pragma omp parallel for
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
index_t
h
=
0
;
h
<
output_height
;
++
h
)
{
const
index_t
in_h
=
h
/
block_size_
;
const
index_t
offset_h
=
(
h
%
block_size_
);
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
const
index_t
in_w
=
w
/
block_size_
;
const
index_t
offset_w
=
w
%
block_size_
;
const
index_t
offset_d
=
(
offset_h
*
block_size_
+
offset_w
)
*
output_depth
;
const
index_t
in_d
=
d
+
offset_d
;
const
index_t
o_index
=
((
b
*
output_depth
+
d
)
*
output_height
+
h
)
*
output_width
+
w
;
const
index_t
i_index
=
((
b
*
input_depth
+
in_d
)
*
input_height
+
in_h
)
*
input_width
+
in_w
;
output_ptr
[
o_index
]
=
input_ptr
[
i_index
];
}
}
}
}
}
else
{
#pragma omp parallel for
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
d
=
0
;
d
<
input_depth
;
++
d
)
{
for
(
index_t
h
=
0
;
h
<
input_height
;
++
h
)
{
const
index_t
out_h
=
h
/
block_size_
;
const
index_t
offset_h
=
(
h
%
block_size_
);
for
(
index_t
w
=
0
;
w
<
input_width
;
++
w
)
{
const
index_t
out_w
=
w
/
block_size_
;
const
index_t
offset_w
=
(
w
%
block_size_
);
const
index_t
offset_d
=
(
offset_h
*
block_size_
+
offset_w
)
*
input_depth
;
const
index_t
out_d
=
d
+
offset_d
;
const
index_t
o_index
=
((
b
*
output_depth
+
out_d
)
*
output_height
+
out_h
)
*
output_width
+
out_w
;
const
index_t
i_index
=
((
b
*
input_depth
+
d
)
*
input_height
+
h
)
*
input_width
+
w
;
output_ptr
[
o_index
]
=
input_ptr
[
i_index
];
}
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
d
=
0
;
d
<
output_depth
;
++
d
)
{
for
(
index_t
h
=
0
;
h
<
output_height
;
++
h
)
{
const
index_t
in_h
=
h
/
block_size_
;
const
index_t
offset_h
=
(
h
%
block_size_
);
for
(
int
w
=
0
;
w
<
output_width
;
++
w
)
{
const
index_t
in_w
=
w
/
block_size_
;
const
index_t
offset_w
=
w
%
block_size_
;
const
index_t
offset_d
=
(
offset_h
*
block_size_
+
offset_w
)
*
output_depth
;
const
index_t
in_d
=
d
+
offset_d
;
const
index_t
o_index
=
((
b
*
output_depth
+
d
)
*
output_height
+
h
)
*
output_width
+
w
;
const
index_t
i_index
=
((
b
*
input_depth
+
in_d
)
*
input_height
+
in_h
)
*
input_width
+
in_w
;
output_ptr
[
o_index
]
=
input_ptr
[
i_index
];
}
}
}
}
return
MACE_SUCCESS
;
}
const
int
block_size_
;
bool
d2s_
;
};
#ifdef MACE_ENABLE_OPENCL
template
<
typename
T
>
struct
DepthToSpaceOpFunctor
<
DeviceType
::
GPU
,
T
>
:
OpKernel
{
DepthToSpaceOpFunctor
(
OpKernelContext
*
context
,
const
int
block_size
,
bool
d2s
)
:
OpKernel
(
context
),
block_size_
(
block_size
),
d2s_
(
d2s
)
{}
const
int
block_size
)
:
OpKernel
(
context
),
block_size_
(
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
block_size_
;
bool
d2s_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
...
...
mace/kernels/opencl/batch_to_space.cc
0 → 100644
浏览文件 @
ea9a8243
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
#include "mace/kernels/batch_to_space.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
MaceStatus
BatchToSpaceFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NHWC
,
output_shape
.
data
());
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
MACE_RETURN_IF_ERROR
(
space_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
const
uint32_t
chan_blk
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
batch_tensor
->
dim
(
3
)));
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
auto
runtime
=
context_
->
device
()
->
opencl_runtime
();
if
(
kernel_
.
get
()
==
nullptr
)
{
const
char
*
kernel_name
=
"batch_to_space"
;
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
set
<
std
::
string
>
built_options
;
OUT_OF_RANGE_CONFIG
(
kernel_error_
,
context_
);
NON_UNIFORM_WG_CONFIG
;
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
built_options
.
emplace
(
kernel_name_ss
.
str
());
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
MACE_RETURN_IF_ERROR
(
runtime
->
BuildKernel
(
"batch_to_space"
,
obfuscated_kernel_name
,
built_options
,
&
kernel_
));
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
if
(
!
IsVecEqual
(
space_shape_
,
space_tensor
->
shape
()))
{
uint32_t
idx
=
0
;
OUT_OF_RANGE_SET_ARG
;
SET_3D_GWS_ARGS
(
kernel_
);
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
space_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
block_shape_
[
0
]);
kernel_
.
setArg
(
idx
++
,
block_shape_
[
1
]);
kernel_
.
setArg
(
idx
++
,
paddings_
[
0
]);
kernel_
.
setArg
(
idx
++
,
paddings_
[
2
]);
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
space_tensor
->
dim
(
0
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
space_tensor
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
space_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
2
)));
space_shape_
=
space_tensor
->
shape
();
}
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
runtime
,
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"batch_to_space"
,
batch_tensor
->
dim
(
0
),
batch_tensor
->
dim
(
1
),
batch_tensor
->
dim
(
2
),
batch_tensor
->
dim
(
3
));
MACE_RETURN_IF_ERROR
(
TuningOrRun3DKernel
(
runtime
,
kernel_
,
tuning_key
,
gws
,
lws
,
future
));
OUT_OF_RANGE_VALIDATION
(
kernel_error_
);
return
MACE_SUCCESS
;
}
template
struct
BatchToSpaceFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
BatchToSpaceFunctor
<
DeviceType
::
GPU
,
half
>;
}
// namespace kernels
}
// namespace mace
#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_
mace/kernels/opencl/cl/batch_to_space.cl
0 → 100644
浏览文件 @
ea9a8243
#
include
<common.h>
__kernel
void
batch_to_space
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
batch_data,
__write_only
image2d_t
space_data,
__private
const
int
block_height,
__private
const
int
block_width,
__private
const
int
padding_height,
__private
const
int
padding_width,
__private
const
int
batch_size,
__private
const
int
space_height,
__private
const
int
space_width,
__private
const
int
batch_height,
__private
const
int
batch_width
)
{
const
int
chan_idx
=
get_global_id
(
0
)
;
const
int
batch_w_idx
=
get_global_id
(
1
)
;
const
int
batch_hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
chan_idx
>=
global_size_dim0
|
| batch_w_idx >= global_size_dim1
|
|
batch_hb_idx
>=
global_size_dim2
)
{
return
;
}
#
endif
const
int
batch_b_idx
=
batch_hb_idx
/
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
%
batch_height
;
const
int
block_size
=
mul24
(
block_height,
block_width
)
;
const
int
space_b_idx
=
batch_b_idx
%
batch_size
;
const
int
remaining_batch_idx
=
batch_b_idx
/
batch_size
;
const
int
space_h_idx
=
(
remaining_batch_idx
/
block_width
)
+
mul24
(
batch_h_idx,
block_height
)
-
padding_height
;
const
int
space_w_idx
=
(
remaining_batch_idx
%
block_width
)
+
mul24
(
batch_w_idx,
block_width
)
-
padding_width
;
if
(
0
<=
space_w_idx
&&
space_w_idx
<
space_width
&&
0
<=
space_h_idx
&&
space_h_idx
<
space_height
)
{
int2
batch_coord
=
(
int2
)(
mul24
(
chan_idx,
batch_width
)
+
batch_w_idx,
batch_hb_idx
)
;
DATA_TYPE4
value
=
READ_IMAGET
(
batch_data,
SAMPLER,
batch_coord
)
;
int2
space_coord
=
(
int2
)(
mul24
(
chan_idx,
space_width
)
+
space_w_idx,
space_b_idx
*
space_height
+
space_h_idx
)
;
WRITE_IMAGET
(
space_data,
space_coord,
value
)
;
}
}
mace/kernels/opencl/cl/depth_to_space.cl
浏览文件 @
ea9a8243
...
...
@@ -39,43 +39,3 @@ __kernel void depth_to_space(KERNEL_ERROR_PARAMS
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_hb
)
,
in_data
)
;
}
__kernel void space_to_depth(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int block_size,
__private const int input_width,
__private const int input_depth_blocks,
__private const int output_hb,
__private const int output_width,
__private const int output_depth_blocks,
__write_only image2d_t output) {
const int d = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
#endif
const int in_pos = mad24(d, input_width, w);
const int out_hb = hb / block_size;
const int offset_h = hb % block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_hb >= output_hb |
|
out_w
>=
output_width
)
{
return
;
}
const
int
out_pos
=
mad24
(
out_d,
output_width,
out_w
)
;
DATA_TYPE4
in_data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_pos,
hb
))
;
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_hb
)
,
in_data
)
;
}
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
ea9a8243
...
...
@@ -49,50 +49,3 @@ __kernel void space_to_batch(KERNEL_ERROR_PARAMS
WRITE_IMAGET
(
batch_data,
batch_coord,
value
)
;
}
__kernel void batch_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
__private const int block_width,
__private const int padding_height,
__private const int padding_width,
__private const int batch_size,
__private const int space_height,
__private const int space_width,
__private const int batch_height,
__private const int batch_width) {
const int chan_idx = get_global_id(0);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|
|
batch_hb_idx
>=
global_size_dim2
)
{
return
;
}
#
endif
const
int
batch_b_idx
=
batch_hb_idx
/
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
%
batch_height
;
const
int
block_size
=
mul24
(
block_height,
block_width
)
;
const
int
space_b_idx
=
batch_b_idx
%
batch_size
;
const
int
remaining_batch_idx
=
batch_b_idx
/
batch_size
;
const
int
space_h_idx
=
(
remaining_batch_idx
/
block_width
)
+
mul24
(
batch_h_idx,
block_height
)
-
padding_height
;
const
int
space_w_idx
=
(
remaining_batch_idx
%
block_width
)
+
mul24
(
batch_w_idx,
block_width
)
-
padding_width
;
if
(
0
<=
space_w_idx
&&
space_w_idx
<
space_width
&&
0
<=
space_h_idx
&&
space_h_idx
<
space_height
)
{
int2
batch_coord
=
(
int2
)(
mul24
(
chan_idx,
batch_width
)
+
batch_w_idx,
batch_hb_idx
)
;
DATA_TYPE4
value
=
READ_IMAGET
(
batch_data,
SAMPLER,
batch_coord
)
;
int2
space_coord
=
(
int2
)(
mul24
(
chan_idx,
space_width
)
+
space_w_idx,
space_b_idx
*
space_height
+
space_h_idx
)
;
WRITE_IMAGET
(
space_data,
space_coord,
value
)
;
}
}
mace/kernels/opencl/cl/space_to_depth.cl
0 → 100644
浏览文件 @
ea9a8243
#
include
<common.h>
__kernel
void
space_to_depth
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
block_size,
__private
const
int
input_width,
__private
const
int
input_depth_blocks,
__private
const
int
output_hb,
__private
const
int
output_width,
__private
const
int
output_depth_blocks,
__write_only
image2d_t
output
)
{
const
int
d
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
d
>=
global_size_dim0
|
| w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
#endif
const int in_pos = mad24(d, input_width, w);
const int out_hb = hb / block_size;
const int offset_h = hb % block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_hb >= output_hb |
|
out_w
>=
output_width
)
{
return
;
}
const
int
out_pos
=
mad24
(
out_d,
output_width,
out_w
)
;
DATA_TYPE4
in_data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_pos,
hb
))
;
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_hb
)
,
in_data
)
;
}
mace/kernels/opencl/depth_to_space.cc
浏览文件 @
ea9a8243
...
...
@@ -30,54 +30,41 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const
index_t
input_width
=
input
->
dim
(
2
);
const
index_t
input_depth
=
input
->
dim
(
3
);
const
char
*
kernel_name
=
nullptr
;
MACE_CHECK
(
input_depth
%
(
block_size_
*
block_size_
)
==
0
,
"input depth should be dividable by block_size * block_size"
,
input_depth
);
MACE_CHECK
((
input_depth
%
4
)
==
0
,
"input channel should be dividable by 4"
);
const
index_t
output_height
=
input_height
*
block_size_
;
const
index_t
output_width
=
input_width
*
block_size_
;
const
index_t
output_depth
=
input_depth
/
(
block_size_
*
block_size_
);
MACE_CHECK
(
output_depth
%
4
==
0
,
"output channel not support:"
)
<<
output_depth
;
uint32_t
gws
[
3
];
std
::
string
tuning_key
;
index_t
output_height
,
output_width
,
output_depth
;
if
(
d2s_
)
{
output_height
=
input_height
*
block_size_
;
output_width
=
input_width
*
block_size_
;
output_depth
=
input_depth
/
(
block_size_
*
block_size_
);
MACE_CHECK
(
output_depth
%
4
==
0
,
"output channel not support:"
)
<<
output_depth
;
kernel_name
=
"depth_to_space"
;
gws
[
0
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
output_depth
));
gws
[
1
]
=
static_cast
<
uint32_t
>
(
output_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
output_height
*
batch
);
tuning_key
=
Concat
(
"depth_to_space_opencl_kernel"
,
batch
,
output_height
,
output_width
,
output_depth
);
}
else
{
output_height
=
input_height
/
block_size_
;
output_width
=
input_width
/
block_size_
;
output_depth
=
input_depth
*
block_size_
*
block_size_
;
MACE_CHECK
(
input_depth
%
4
==
0
,
"input channel not support:"
)
<<
input_depth
;
kernel_name
=
"space_to_depth"
;
gws
[
0
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_depth
));
gws
[
1
]
=
static_cast
<
uint32_t
>
(
input_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
input_height
*
batch
);
tuning_key
=
Concat
(
"space_to_depth_opencl_kernel"
,
input
->
dim
(
0
),
input
->
dim
(
1
),
input
->
dim
(
2
),
input
->
dim
(
3
));
}
const
index_t
input_depth_blocks
=
RoundUpDiv4
(
input_depth
);
const
index_t
output_depth_blocks
=
RoundUpDiv4
(
output_depth
);
std
::
vector
<
index_t
>
output_shape
=
{
batch
,
output_height
,
output_width
,
std
::
vector
<
index_t
>
output_shape
=
{
batch
,
output_height
,
output_width
,
output_depth
};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
output_depth
)),
static_cast
<
uint32_t
>
(
output_width
),
static_cast
<
uint32_t
>
(
output_height
*
batch
)
};
auto
runtime
=
context_
->
device
()
->
opencl_runtime
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
OUT_OF_RANGE_CONFIG
(
kernel_error_
,
context_
);
NON_UNIFORM_WG_CONFIG
;
const
char
*
kernel_name
=
kernel_name
=
"depth_to_space"
;
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
...
...
@@ -89,7 +76,6 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
obfuscated_kernel_name
,
built_options
,
&
kernel_
));
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
...
...
@@ -99,26 +85,20 @@ MaceStatus DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
OUT_OF_RANGE_SET_ARG
;
SET_3D_GWS_ARGS
(
kernel_
);
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
if
(
d2s_
)
{
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
}
else
{
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
}
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
std
::
string
tuning_key
=
Concat
(
"depth_to_space_opencl_kernel"
,
batch
,
output_height
,
output_width
,
output_depth
);
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
runtime
,
gws
,
kwg_size_
);
MACE_RETURN_IF_ERROR
(
TuningOrRun3DKernel
(
runtime
,
kernel_
,
tuning_key
,
gws
,
lws
,
future
));
...
...
mace/kernels/opencl/space_to_batch.cc
浏览文件 @
ea9a8243
...
...
@@ -28,27 +28,14 @@ template <typename T>
MaceStatus
SpaceToBatchFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
StatsFuture
*
future
)
{
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
if
(
b2s_
)
{
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NHWC
,
output_shape
.
data
());
}
else
{
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NHWC
,
output_shape
.
data
());
}
const
char
*
kernel_name
=
nullptr
;
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NHWC
,
output_shape
.
data
());
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
output_image_shape
);
if
(
b2s_
)
{
MACE_RETURN_IF_ERROR
(
space_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
kernel_name
=
"batch_to_space"
;
}
else
{
MACE_RETURN_IF_ERROR
(
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
kernel_name
=
"space_to_batch"
;
}
MACE_RETURN_IF_ERROR
(
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
));
const
char
*
kernel_name
=
"space_to_batch"
;
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
...
...
@@ -79,13 +66,9 @@ MaceStatus SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
uint32_t
idx
=
0
;
OUT_OF_RANGE_SET_ARG
;
SET_3D_GWS_ARGS
(
kernel_
);
if
(
b2s_
)
{
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
space_tensor
->
opencl_image
()));
}
else
{
kernel_
.
setArg
(
idx
++
,
*
(
space_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
}
kernel_
.
setArg
(
idx
++
,
*
(
space_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
block_shape_
[
0
]);
kernel_
.
setArg
(
idx
++
,
block_shape_
[
1
]);
kernel_
.
setArg
(
idx
++
,
paddings_
[
0
]);
...
...
mace/kernels/opencl/space_to_depth.cc
0 → 100644
浏览文件 @
ea9a8243
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/space_to_depth.h"
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"
#include "mace/utils/utils.h"
namespace
mace
{
namespace
kernels
{
template
<
typename
T
>
MaceStatus
SpaceToDepthOpFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
index_t
batch
=
input
->
dim
(
0
);
const
index_t
input_height
=
input
->
dim
(
1
);
const
index_t
input_width
=
input
->
dim
(
2
);
const
index_t
input_depth
=
input
->
dim
(
3
);
MACE_CHECK
((
input_depth
%
4
)
==
0
,
"input channel should be dividable by 4"
);
MACE_CHECK
(
(
input_width
%
block_size_
==
0
)
&&
(
input_height
%
block_size_
==
0
),
"input width and height should be dividable by block_size"
);
const
index_t
output_height
=
input_height
/
block_size_
;
const
index_t
output_width
=
input_width
/
block_size_
;
const
index_t
output_depth
=
input_depth
*
block_size_
*
block_size_
;
const
index_t
input_depth_blocks
=
RoundUpDiv4
(
input_depth
);
const
index_t
output_depth_blocks
=
RoundUpDiv4
(
output_depth
);
std
::
vector
<
index_t
>
output_shape
=
{
batch
,
output_height
,
output_width
,
output_depth
};
std
::
vector
<
size_t
>
image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
MACE_RETURN_IF_ERROR
(
output
->
ResizeImage
(
output_shape
,
image_shape
));
auto
runtime
=
context_
->
device
()
->
opencl_runtime
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
OUT_OF_RANGE_CONFIG
(
kernel_error_
,
context_
);
NON_UNIFORM_WG_CONFIG
;
const
char
*
kernel_name
=
"space_to_depth"
;
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
built_options
.
emplace
(
kernel_name_ss
.
str
());
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
MACE_RETURN_IF_ERROR
(
runtime
->
BuildKernel
(
"space_to_depth"
,
obfuscated_kernel_name
,
built_options
,
&
kernel_
));
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input_depth_blocks
),
static_cast
<
uint32_t
>
(
input_width
),
static_cast
<
uint32_t
>
(
input_height
*
batch
)};
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
OUT_OF_RANGE_SET_ARG
;
SET_3D_GWS_ARGS
(
kernel_
);
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
runtime
,
gws
,
kwg_size_
);
std
::
string
tuning_key
=
Concat
(
"space_to_depth_opencl_kernel"
,
input
->
dim
(
0
),
input
->
dim
(
1
),
input
->
dim
(
2
),
input
->
dim
(
3
));
MACE_RETURN_IF_ERROR
(
TuningOrRun3DKernel
(
runtime
,
kernel_
,
tuning_key
,
gws
,
lws
,
future
));
OUT_OF_RANGE_VALIDATION
(
kernel_error_
);
return
MACE_SUCCESS
;
}
template
struct
SpaceToDepthOpFunctor
<
DeviceType
::
GPU
,
float
>;
template
struct
SpaceToDepthOpFunctor
<
DeviceType
::
GPU
,
half
>;
}
// namespace kernels
}
// namespace mace
mace/kernels/space_to_batch.h
浏览文件 @
ea9a8243
...
...
@@ -33,12 +33,10 @@ namespace kernels {
struct
SpaceToBatchFunctorBase
:
OpKernel
{
SpaceToBatchFunctorBase
(
OpKernelContext
*
context
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
,
bool
b2s
)
const
std
::
vector
<
int
>
&
block_shape
)
:
OpKernel
(
context
),
paddings_
(
paddings
.
begin
(),
paddings
.
end
()),
block_shape_
(
block_shape
.
begin
(),
block_shape
.
end
()),
b2s_
(
b2s
)
{
block_shape_
(
block_shape
.
begin
(),
block_shape
.
end
())
{
MACE_CHECK
(
block_shape
.
size
()
==
2
&&
block_shape
[
0
]
>
1
&&
block_shape
[
1
]
>
1
,
"Block's shape should be 1D, and greater than 1"
);
...
...
@@ -47,7 +45,6 @@ struct SpaceToBatchFunctorBase : OpKernel {
std
::
vector
<
int
>
paddings_
;
std
::
vector
<
int
>
block_shape_
;
bool
b2s_
;
protected:
void
CalculateSpaceToBatchOutputShape
(
const
Tensor
*
input_tensor
,
...
...
@@ -93,43 +90,6 @@ struct SpaceToBatchFunctorBase : OpKernel {
output_shape
[
3
]
=
new_width
;
}
}
void
CalculateBatchToSpaceOutputShape
(
const
Tensor
*
input_tensor
,
const
DataFormat
data_format
,
index_t
*
output_shape
)
{
MACE_CHECK
(
input_tensor
->
dim_size
()
==
4
,
"Input's shape should be 4D"
);
index_t
batch
=
input_tensor
->
dim
(
0
);
index_t
channels
=
0
;
index_t
height
=
0
;
index_t
width
=
0
;
if
(
data_format
==
DataFormat
::
NHWC
)
{
height
=
input_tensor
->
dim
(
1
);
width
=
input_tensor
->
dim
(
2
);
channels
=
input_tensor
->
dim
(
3
);
}
else
if
(
data_format
==
DataFormat
::
NCHW
)
{
height
=
input_tensor
->
dim
(
2
);
width
=
input_tensor
->
dim
(
3
);
channels
=
input_tensor
->
dim
(
1
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
index_t
new_batch
=
batch
/
block_shape_
[
0
]
/
block_shape_
[
1
];
index_t
new_height
=
height
*
block_shape_
[
0
]
-
paddings_
[
0
]
-
paddings_
[
1
];
index_t
new_width
=
width
*
block_shape_
[
1
]
-
paddings_
[
2
]
-
paddings_
[
3
];
if
(
data_format
==
DataFormat
::
NHWC
)
{
output_shape
[
0
]
=
new_batch
;
output_shape
[
1
]
=
new_height
;
output_shape
[
2
]
=
new_width
;
output_shape
[
3
]
=
channels
;
}
else
{
output_shape
[
0
]
=
new_batch
;
output_shape
[
1
]
=
channels
;
output_shape
[
2
]
=
new_height
;
output_shape
[
3
]
=
new_width
;
}
}
};
template
<
DeviceType
D
,
typename
T
>
...
...
@@ -139,9 +99,8 @@ template<>
struct
SpaceToBatchFunctor
<
DeviceType
::
CPU
,
float
>
:
SpaceToBatchFunctorBase
{
SpaceToBatchFunctor
(
OpKernelContext
*
context
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
,
bool
b2s
)
:
SpaceToBatchFunctorBase
(
context
,
paddings
,
block_shape
,
b2s
)
{}
const
std
::
vector
<
int
>
&
block_shape
)
:
SpaceToBatchFunctorBase
(
context
,
paddings
,
block_shape
)
{}
MaceStatus
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
...
...
@@ -149,17 +108,11 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
MACE_UNUSED
(
future
);
std
::
vector
<
index_t
>
output_shape
(
4
,
0
);
if
(
b2s_
)
{
CalculateBatchToSpaceOutputShape
(
batch_tensor
,
DataFormat
::
NCHW
,
output_shape
.
data
());
MACE_RETURN_IF_ERROR
(
space_tensor
->
Resize
(
output_shape
));
}
else
{
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NCHW
,
output_shape
.
data
());
MACE_RETURN_IF_ERROR
(
batch_tensor
->
Resize
(
output_shape
));
}
CalculateSpaceToBatchOutputShape
(
space_tensor
,
DataFormat
::
NCHW
,
output_shape
.
data
());
MACE_RETURN_IF_ERROR
(
batch_tensor
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
input_guard
(
space_tensor
);
Tensor
::
MappingGuard
output_guard
(
batch_tensor
);
...
...
@@ -169,152 +122,85 @@ struct SpaceToBatchFunctor<DeviceType::CPU, float> : SpaceToBatchFunctorBase {
int
block_shape_h
=
block_shape_
[
0
];
int
block_shape_w
=
block_shape_
[
1
];
if
(
b2s_
)
{
const
float
*
input_data
=
batch_tensor
->
data
<
float
>
();
float
*
output_data
=
space_tensor
->
mutable_data
<
float
>
();
const
float
*
input_data
=
space_tensor
->
data
<
float
>
();
float
*
output_data
=
batch_tensor
->
mutable_data
<
float
>
();
index_t
in_batches
=
batch
_tensor
->
dim
(
0
);
index_t
in_height
=
batch
_tensor
->
dim
(
2
);
index_t
in_width
=
batch
_tensor
->
dim
(
3
);
index_t
in_batches
=
space
_tensor
->
dim
(
0
);
index_t
in_height
=
space
_tensor
->
dim
(
2
);
index_t
in_width
=
space
_tensor
->
dim
(
3
);
index_t
out_batches
=
space
_tensor
->
dim
(
0
);
index_t
channels
=
space
_tensor
->
dim
(
1
);
index_t
out_height
=
space
_tensor
->
dim
(
2
);
index_t
out_width
=
space
_tensor
->
dim
(
3
);
index_t
out_batches
=
batch
_tensor
->
dim
(
0
);
index_t
channels
=
batch
_tensor
->
dim
(
1
);
index_t
out_height
=
batch
_tensor
->
dim
(
2
);
index_t
out_width
=
batch
_tensor
->
dim
(
3
);
// 32k/sizeof(float)/out_width/block_shape
index_t
block_h_size
=
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
out_width
);
index_t
block_h_size
=
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
in_width
);
// make channel outter loop so we can make best use of cache
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
in_height
;
block_h
+=
block_h_size
)
{
for
(
index_t
in_b
=
0
;
in_b
<
in_batches
;
++
in_b
)
{
const
index_t
b
=
in_b
%
out_batches
;
const
index_t
tile_index
=
in_b
/
out_batches
;
const
index_t
tile_h
=
tile_index
/
block_shape_w
;
const
index_t
tile_w
=
tile_index
%
block_shape_w
;
const
index_t
valid_h_start
=
std
::
max
(
block_h
,
(
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
);
const
index_t
valid_h_end
=
std
::
min
(
in_height
,
std
::
min
(
block_h
+
block_h_size
,
(
out_height
+
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
));
const
index_t
valid_w_start
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
(
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
index_t
valid_w_end
=
std
::
min
(
in_width
,
(
out_width
+
pad_left
-
tile_w
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
out_height
;
block_h
+=
block_h_size
)
{
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
const
index_t
in_b
=
b
%
in_batches
;
const
index_t
tile_index
=
b
/
in_batches
;
const
index_t
tile_h
=
tile_index
/
block_shape_w
;
const
index_t
tile_w
=
tile_index
%
block_shape_w
;
const
index_t
valid_h_start
=
std
::
max
(
block_h
,
(
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
);
const
index_t
valid_h_end
=
std
::
min
(
out_height
,
std
::
min
(
block_h
+
block_h_size
,
(
in_height
+
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
));
const
index_t
valid_w_start
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
(
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
float
*
input_base
=
input_data
+
(
in_b
*
channels
+
c
)
*
in_height
*
in_width
;
float
*
output_base
=
output_data
+
(
b
*
channels
+
c
)
*
out_height
*
out_width
;
index_t
h
=
valid_h_start
*
block_shape_h
+
tile_h
-
pad_top
;
for
(
index_t
in_h
=
valid_h_start
;
in_h
<
valid_h_end
;
++
in_h
)
{
index_t
w
=
valid_w_start
*
block_shape_w
+
tile_w
-
pad_left
;
for
(
index_t
in_w
=
valid_w_start
;
in_w
<
valid_w_end
;
++
in_w
)
{
output_base
[
h
*
out_width
+
w
]
=
input_base
[
in_h
*
in_width
+
in_w
];
w
+=
block_shape_w
;
}
// w
h
+=
block_shape_h
;
}
// h
}
// b
}
// block_h
}
// c
}
else
{
const
float
*
input_data
=
space_tensor
->
data
<
float
>
();
float
*
output_data
=
batch_tensor
->
mutable_data
<
float
>
();
index_t
in_batches
=
space_tensor
->
dim
(
0
);
index_t
in_height
=
space_tensor
->
dim
(
2
);
index_t
in_width
=
space_tensor
->
dim
(
3
);
index_t
out_batches
=
batch_tensor
->
dim
(
0
);
index_t
channels
=
batch_tensor
->
dim
(
1
);
index_t
out_height
=
batch_tensor
->
dim
(
2
);
index_t
out_width
=
batch_tensor
->
dim
(
3
);
const
index_t
valid_w_end
=
std
::
min
(
out_width
,
(
in_width
+
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
float
*
input_base
=
input_data
+
(
in_b
*
channels
+
c
)
*
in_height
*
in_width
;
float
*
output_base
=
output_data
+
(
b
*
channels
+
c
)
*
out_height
*
out_width
;
memset
(
output_base
+
block_h
*
out_width
,
0
,
(
valid_h_start
-
block_h
)
*
out_width
*
sizeof
(
float
));
index_t
in_h
=
valid_h_start
*
block_shape_h
+
tile_h
-
pad_top
;
for
(
index_t
h
=
valid_h_start
;
h
<
valid_h_end
;
++
h
)
{
memset
(
output_base
+
h
*
out_width
,
0
,
valid_w_start
*
sizeof
(
float
));
index_t
block_h_size
=
std
::
max
(
static_cast
<
index_t
>
(
1
),
8
*
1024
/
block_shape_w
/
in_width
);
index_t
in_w
=
valid_w_start
*
block_shape_w
+
tile_w
-
pad_left
;
for
(
index_t
w
=
valid_w_start
;
w
<
valid_w_end
;
++
w
)
{
output_base
[
h
*
out_width
+
w
]
=
input_base
[
in_h
*
in_width
+
in_w
];
in_w
+=
block_shape_w
;
}
// w
in_h
+=
block_shape_h
;
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
for
(
index_t
c
=
0
;
c
<
channels
;
++
c
)
{
for
(
index_t
block_h
=
0
;
block_h
<
out_height
;
block_h
+=
block_h_size
)
{
for
(
index_t
b
=
0
;
b
<
out_batches
;
++
b
)
{
const
index_t
in_b
=
b
%
in_batches
;
const
index_t
tile_index
=
b
/
in_batches
;
const
index_t
tile_h
=
tile_index
/
block_shape_w
;
const
index_t
tile_w
=
tile_index
%
block_shape_w
;
const
index_t
valid_h_start
=
std
::
max
(
block_h
,
(
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
);
const
index_t
valid_h_end
=
std
::
min
(
out_height
,
std
::
min
(
block_h
+
block_h_size
,
(
in_height
+
pad_top
-
tile_h
+
block_shape_h
-
1
)
/
block_shape_h
));
const
index_t
valid_w_start
=
std
::
max
(
static_cast
<
index_t
>
(
0
),
(
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
index_t
valid_w_end
=
std
::
min
(
out_width
,
(
in_width
+
pad_left
-
tile_w
+
block_shape_w
-
1
)
/
block_shape_w
);
const
float
*
input_base
=
input_data
+
(
in_b
*
channels
+
c
)
*
in_height
*
in_width
;
float
*
output_base
=
output_data
+
(
b
*
channels
+
c
)
*
out_height
*
out_width
;
memset
(
output_base
+
block_h
*
out_width
,
memset
(
output_base
+
h
*
out_width
+
valid_w_end
,
0
,
(
valid_h_start
-
block_h
)
*
out_width
*
sizeof
(
float
));
index_t
in_h
=
valid_h_start
*
block_shape_h
+
tile_h
-
pad_top
;
for
(
index_t
h
=
valid_h_start
;
h
<
valid_h_end
;
++
h
)
{
memset
(
output_base
+
h
*
out_width
,
0
,
valid_w_start
*
sizeof
(
float
));
index_t
in_w
=
valid_w_start
*
block_shape_w
+
tile_w
-
pad_left
;
for
(
index_t
w
=
valid_w_start
;
w
<
valid_w_end
;
++
w
)
{
output_base
[
h
*
out_width
+
w
]
=
input_base
[
in_h
*
in_width
+
in_w
];
in_w
+=
block_shape_w
;
}
// w
in_h
+=
block_shape_h
;
memset
(
output_base
+
h
*
out_width
+
valid_w_end
,
0
,
(
out_width
-
valid_w_end
)
*
sizeof
(
float
));
}
// h
memset
(
output_base
+
valid_h_end
*
out_width
,
0
,
(
std
::
min
(
out_height
,
block_h
+
block_h_size
)
-
valid_h_end
)
*
out_width
*
sizeof
(
float
));
}
// b
}
// block_h
}
// c
}
(
out_width
-
valid_w_end
)
*
sizeof
(
float
));
}
// h
memset
(
output_base
+
valid_h_end
*
out_width
,
0
,
(
std
::
min
(
out_height
,
block_h
+
block_h_size
)
-
valid_h_end
)
*
out_width
*
sizeof
(
float
));
}
// b
}
// block_h
}
// c
return
MACE_SUCCESS
;
}
};
...
...
@@ -324,9 +210,8 @@ template <typename T>
struct
SpaceToBatchFunctor
<
DeviceType
::
GPU
,
T
>
:
SpaceToBatchFunctorBase
{
SpaceToBatchFunctor
(
OpKernelContext
*
context
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
block_shape
,
bool
b2s
)
:
SpaceToBatchFunctorBase
(
context
,
paddings
,
block_shape
,
b2s
)
{}
const
std
::
vector
<
int
>
&
block_shape
)
:
SpaceToBatchFunctorBase
(
context
,
paddings
,
block_shape
)
{}
MaceStatus
operator
()(
Tensor
*
space_tensor
,
Tensor
*
batch_tensor
,
...
...
mace/kernels/space_to_depth.h
0 → 100644
浏览文件 @
ea9a8243
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_SPACE_TO_DEPTH_H_
#define MACE_KERNELS_SPACE_TO_DEPTH_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/public/mace.h"
#include "mace/kernels/kernel.h"
#ifdef MACE_ENABLE_OPENCL
#include "mace/core/runtime/opencl/cl2_header.h"
#endif // MACE_ENABLE_OPENCL
namespace
mace
{
namespace
kernels
{
template
<
DeviceType
D
,
typename
T
>
struct
SpaceToDepthOpFunctor
:
OpKernel
{
SpaceToDepthOpFunctor
(
OpKernelContext
*
context
,
const
int
block_size
)
:
OpKernel
(
context
),
block_size_
(
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
MACE_UNUSED
(
future
);
const
index_t
batch_size
=
input
->
dim
(
0
);
const
index_t
input_depth
=
input
->
dim
(
1
);
const
index_t
input_height
=
input
->
dim
(
2
);
const
index_t
input_width
=
input
->
dim
(
3
);
MACE_CHECK
(
(
input_width
%
block_size_
==
0
)
&&
(
input_height
%
block_size_
==
0
),
"input width and height should be dividable by block_size"
);
const
index_t
output_depth
=
input_depth
*
block_size_
*
block_size_
;
const
index_t
output_width
=
input_width
/
block_size_
;
const
index_t
output_height
=
input_height
/
block_size_
;
std
::
vector
<
index_t
>
output_shape
=
{
batch_size
,
output_depth
,
output_height
,
output_width
};
MACE_RETURN_IF_ERROR
(
output
->
Resize
(
output_shape
));
Tensor
::
MappingGuard
logits_guard
(
input
);
Tensor
::
MappingGuard
output_guard
(
output
);
const
T
*
input_ptr
=
input
->
data
<
T
>
();
T
*
output_ptr
=
output
->
mutable_data
<
T
>
();
#pragma omp parallel for
for
(
index_t
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
index_t
d
=
0
;
d
<
input_depth
;
++
d
)
{
for
(
index_t
h
=
0
;
h
<
input_height
;
++
h
)
{
const
index_t
out_h
=
h
/
block_size_
;
const
index_t
offset_h
=
(
h
%
block_size_
);
for
(
index_t
w
=
0
;
w
<
input_width
;
++
w
)
{
const
index_t
out_w
=
w
/
block_size_
;
const
index_t
offset_w
=
(
w
%
block_size_
);
const
index_t
offset_d
=
(
offset_h
*
block_size_
+
offset_w
)
*
input_depth
;
const
index_t
out_d
=
d
+
offset_d
;
const
index_t
o_index
=
((
b
*
output_depth
+
out_d
)
*
output_height
+
out_h
)
*
output_width
+
out_w
;
const
index_t
i_index
=
((
b
*
input_depth
+
d
)
*
input_height
+
h
)
*
input_width
+
w
;
output_ptr
[
o_index
]
=
input_ptr
[
i_index
];
}
}
}
}
return
MACE_SUCCESS
;
}
const
int
block_size_
;
};
#ifdef MACE_ENABLE_OPENCL
template
<
typename
T
>
struct
SpaceToDepthOpFunctor
<
DeviceType
::
GPU
,
T
>
:
OpKernel
{
explicit
SpaceToDepthOpFunctor
(
OpKernelContext
*
context
,
const
int
block_size
)
:
OpKernel
(
context
),
block_size_
(
block_size
)
{}
MaceStatus
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
const
int
block_size_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
#endif // MACE_ENABLE_OPENCL
}
// namespace kernels
}
// namespace mace
#endif // MACE_KERNELS_SPACE_TO_DEPTH_H_
mace/ops/batch_to_space.h
浏览文件 @
ea9a8243
...
...
@@ -19,7 +19,7 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/
space_to_batch
.h"
#include "mace/kernels/
batch_to_space
.h"
namespace
mace
{
namespace
ops
{
...
...
@@ -31,8 +31,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
:
Operator
<
D
,
T
>
(
op_def
,
context
),
functor_
(
context
,
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"crops"
,
{
0
,
0
,
0
,
0
}),
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"block_shape"
,
{
1
,
1
}),
true
)
{}
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"block_shape"
,
{
1
,
1
}))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
batch_tensor
=
this
->
Input
(
INPUT
);
...
...
@@ -41,7 +40,7 @@ class BatchToSpaceNDOp : public Operator<D, T> {
}
private:
kernels
::
SpaceToBatch
Functor
<
D
,
T
>
functor_
;
kernels
::
BatchToSpace
Functor
<
D
,
T
>
functor_
;
protected:
MACE_OP_INPUT_TAGS
(
INPUT
);
...
...
mace/ops/depth_to_space.h
浏览文件 @
ea9a8243
...
...
@@ -30,26 +30,13 @@ class DepthToSpaceOp : public Operator<D, T> {
DepthToSpaceOp
(
const
OperatorDef
&
op_def
,
OpKernelContext
*
context
)
:
Operator
<
D
,
T
>
(
op_def
,
context
),
block_size_
(
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
)),
functor_
(
context
,
this
->
block_size_
,
true
)
{}
functor_
(
context
,
this
->
block_size_
)
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_CHECK
(
input
->
dim_size
()
==
4
,
"input dim should be 4"
);
int
input_depth
;
if
(
D
==
CPU
)
{
input_depth
=
input
->
dim
(
1
);
}
else
if
(
D
==
GPU
)
{
input_depth
=
input
->
dim
(
3
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
MACE_CHECK
(
input_depth
%
(
block_size_
*
block_size_
)
==
0
,
"input depth should be dividable by block_size * block_size"
,
input_depth
);
MACE_CHECK
((
input_depth
%
4
)
==
0
,
"input channel should be dividable by 4"
);
return
functor_
(
input
,
output
,
future
);
}
...
...
mace/ops/depth_to_space_test.cc
浏览文件 @
ea9a8243
...
...
@@ -24,21 +24,18 @@ namespace test {
namespace
{
template
<
DeviceType
D
>
void
RunDepthToSpace
(
const
bool
d2s
,
const
std
::
vector
<
index_t
>
&
input_shape
,
void
RunDepthToSpace
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
float
>
&
input_data
,
const
int
block_size
,
const
std
::
vector
<
index_t
>
&
expected_shape
,
const
std
::
vector
<
float
>
&
expected_data
)
{
OpsTestNet
net
;
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
input_shape
,
input_data
);
const
char
*
ops_name
=
(
d2s
)
?
"DepthToSpace"
:
"SpaceToDepth"
;
const
char
*
ops_test_name
=
(
d2s
)
?
"DepthToSpaceTest"
:
"SpaceToDepthTest"
;
// Construct graph
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
ops_name
,
ops_test_name
)
OpDefBuilder
(
"DepthToSpace"
,
"DepthToSpaceTest"
)
.
Input
(
"InputNCHW"
)
.
Output
(
"OutputNCHW"
)
.
AddIntArg
(
"block_size"
,
block_size
)
...
...
@@ -51,7 +48,7 @@ void RunDepthToSpace(const bool d2s,
}
else
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
ops_name
,
ops_test_name
)
OpDefBuilder
(
"DepthToSpace"
,
"DepthToSpaceTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"block_size"
,
block_size
)
...
...
@@ -69,47 +66,11 @@ void RunDepthToSpace(const bool d2s,
}
}
// namespace
class
SpaceToDepthOpTest
:
public
OpsTestBase
{};
TEST_F
(
SpaceToDepthOpTest
,
Input2x4x4_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
false
,
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input2x4x4_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
false
,
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input2x2x4_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
false
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input4x4x1_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
false
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
class
DepthToSpaceOpTest
:
public
OpsTestBase
{};
TEST_F
(
DepthToSpaceOpTest
,
Input1x2x16_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
true
,
{
1
,
1
,
2
,
16
},
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
2
,
4
,
4
},
...
...
@@ -119,7 +80,7 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_CPU) {
TEST_F
(
DepthToSpaceOpTest
,
Input1x2x16_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
1
,
2
,
16
},
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
2
,
4
,
4
},
...
...
@@ -129,14 +90,14 @@ TEST_F(DepthToSpaceOpTest, Input1x2x16_B2_OPENCL) {
TEST_F
(
DepthToSpaceOpTest
,
Input1x1x16_B2_CPU
)
{
RunDepthToSpace
<
DeviceType
::
CPU
>
(
true
,
{
1
,
1
,
1
,
16
},
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
TEST_F
(
DepthToSpaceOpTest
,
Input1x1x16_B2_OPENCL
)
{
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
1
,
1
,
16
},
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
...
...
@@ -144,14 +105,13 @@ TEST_F(DepthToSpaceOpTest, Input1x1x16_B2_OPENCL) {
TEST_F
(
DepthToSpaceOpTest
,
InputLarger_B2_OPENCL
)
{
const
std
::
vector
<
float
>
in
=
std
::
vector
<
float
>
(
192
*
192
*
128
,
1.0
);
RunDepthToSpace
<
DeviceType
::
GPU
>
(
true
,
{
1
,
192
,
192
,
128
},
in
,
2
,
RunDepthToSpace
<
DeviceType
::
GPU
>
({
1
,
192
,
192
,
128
},
in
,
2
,
{
1
,
384
,
384
,
32
},
in
);
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
RandomTest
(
const
bool
d2s
,
const
int
block_size
,
void
RandomTest
(
const
int
block_size
,
const
std
::
vector
<
index_t
>
&
shape
)
{
testing
::
internal
::
LogToStderr
();
srand
(
time
(
NULL
));
...
...
@@ -159,14 +119,11 @@ void RandomTest(const bool d2s,
// Construct graph
OpsTestNet
net
;
const
char
*
ops_name
=
(
d2s
)
?
"DepthToSpace"
:
"SpaceToDepth"
;
const
char
*
ops_test_name
=
(
d2s
)
?
"DepthToSpaceTest"
:
"SpaceToDepthTest"
;
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
shape
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
ops_name
,
ops_test_name
)
OpDefBuilder
(
"DepthToSpace"
,
"DepthToSpaceTest"
)
.
Input
(
"InputNCHW"
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
Output
(
"OutputNCHW"
)
...
...
@@ -181,7 +138,7 @@ void RandomTest(const bool d2s,
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImg"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
ops_name
,
ops_test_name
)
OpDefBuilder
(
"DepthToSpace"
,
"DepthToSpaceTest"
)
.
Input
(
"InputImg"
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
...
...
@@ -205,19 +162,11 @@ void RandomTest(const bool d2s,
}
// namespace
TEST_F
(
DepthToSpaceOpTest
,
OPENCLRandomFloat
)
{
RandomTest
<
DeviceType
::
GPU
,
float
>
(
true
,
2
,
{
1
,
192
,
192
,
128
});
RandomTest
<
DeviceType
::
GPU
,
float
>
(
2
,
{
1
,
192
,
192
,
128
});
}
TEST_F
(
DepthToSpaceOpTest
,
OPENCLRandomHalf
)
{
RandomTest
<
DeviceType
::
GPU
,
half
>
(
true
,
2
,
{
1
,
192
,
192
,
128
});
}
TEST_F
(
SpaceToDepthOpTest
,
OPENCLRandomFloat
)
{
RandomTest
<
DeviceType
::
GPU
,
float
>
(
false
,
2
,
{
1
,
384
,
384
,
32
});
}
TEST_F
(
SpaceToDepthOpTest
,
OPENCLRandomHalf
)
{
RandomTest
<
DeviceType
::
GPU
,
half
>
(
false
,
2
,
{
1
,
384
,
384
,
32
});
RandomTest
<
DeviceType
::
GPU
,
half
>
(
2
,
{
1
,
192
,
192
,
128
});
}
}
// namespace test
...
...
mace/ops/space_to_batch.h
浏览文件 @
ea9a8243
...
...
@@ -31,8 +31,7 @@ class SpaceToBatchNDOp : public Operator<D, T> {
:
Operator
<
D
,
T
>
(
op_def
,
context
),
functor_
(
context
,
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"paddings"
,
{
0
,
0
,
0
,
0
}),
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"block_shape"
,
{
1
,
1
}),
false
)
{}
OperatorBase
::
GetRepeatedArgs
<
int
>
(
"block_shape"
,
{
1
,
1
}))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
space_tensor
=
this
->
Input
(
INPUT
);
...
...
mace/ops/space_to_depth.h
浏览文件 @
ea9a8243
...
...
@@ -19,7 +19,7 @@
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/
depth_to_space
.h"
#include "mace/kernels/
space_to_depth
.h"
namespace
mace
{
namespace
ops
{
...
...
@@ -30,34 +30,12 @@ class SpaceToDepthOp : public Operator<D, T> {
SpaceToDepthOp
(
const
OperatorDef
&
op_def
,
OpKernelContext
*
context
)
:
Operator
<
D
,
T
>
(
op_def
,
context
),
functor_
(
context
,
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
),
false
)
{}
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
))
{}
MaceStatus
Run
(
StatsFuture
*
future
)
override
{
const
Tensor
*
input
=
this
->
Input
(
INPUT
);
Tensor
*
output
=
this
->
Output
(
OUTPUT
);
MACE_CHECK
(
input
->
dim_size
()
==
4
,
"input dim should be 4"
);
const
int
block_size
=
OperatorBase
::
GetOptionalArg
<
int
>
(
"block_size"
,
1
);
index_t
input_height
;
index_t
input_width
;
index_t
input_depth
;
if
(
D
==
CPU
)
{
input_height
=
input
->
dim
(
2
);
input_width
=
input
->
dim
(
3
);
input_depth
=
input
->
dim
(
1
);
}
else
if
(
D
==
GPU
)
{
input_height
=
input
->
dim
(
1
);
input_width
=
input
->
dim
(
2
);
input_depth
=
input
->
dim
(
3
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
MACE_CHECK
((
input_depth
%
4
)
==
0
,
"input channel should be dividable by 4"
);
MACE_CHECK
(
(
input_width
%
block_size
==
0
)
&&
(
input_height
%
block_size
==
0
),
"input width and height should be dividable by block_size"
,
input
->
dim
(
3
));
return
functor_
(
input
,
output
,
future
);
}
...
...
@@ -66,7 +44,7 @@ class SpaceToDepthOp : public Operator<D, T> {
MACE_OP_OUTPUT_TAGS
(
OUTPUT
);
private:
kernels
::
DepthToSpace
OpFunctor
<
D
,
T
>
functor_
;
kernels
::
SpaceToDepth
OpFunctor
<
D
,
T
>
functor_
;
};
}
// namespace ops
...
...
mace/ops/space_to_depth_test.cc
0 → 100644
浏览文件 @
ea9a8243
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <vector>
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace
mace
{
namespace
ops
{
namespace
test
{
namespace
{
template
<
DeviceType
D
>
void
RunSpaceToDepth
(
const
std
::
vector
<
index_t
>
&
input_shape
,
const
std
::
vector
<
float
>
&
input_data
,
const
int
block_size
,
const
std
::
vector
<
index_t
>
&
expected_shape
,
const
std
::
vector
<
float
>
&
expected_data
)
{
OpsTestNet
net
;
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
input_shape
,
input_data
);
// Construct graph
if
(
D
==
DeviceType
::
CPU
)
{
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"SpaceToDepth"
,
"SpaceToDepthTest"
)
.
Input
(
"InputNCHW"
)
.
Output
(
"OutputNCHW"
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
}
else
{
BufferToImage
<
D
,
float
>
(
&
net
,
"Input"
,
"InputImage"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"SpaceToDepth"
,
"SpaceToDepthTest"
)
.
Input
(
"InputImage"
)
.
Output
(
"OutputImage"
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
}
if
(
D
==
DeviceType
::
GPU
)
{
ImageToBuffer
<
DeviceType
::
GPU
,
float
>
(
&
net
,
"OutputImage"
,
"Output"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
}
auto
expected
=
net
.
CreateTensor
<
float
>
(
expected_shape
,
expected_data
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
1e-5
);
}
}
// namespace
class
SpaceToDepthOpTest
:
public
OpsTestBase
{};
TEST_F
(
SpaceToDepthOpTest
,
Input2x4x4_B2_CPU
)
{
RunSpaceToDepth
<
DeviceType
::
CPU
>
(
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input2x4x4_B2_OPENCL
)
{
RunSpaceToDepth
<
DeviceType
::
GPU
>
(
{
1
,
2
,
4
,
4
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
},
2
,
{
1
,
1
,
2
,
16
},
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
,
18
,
19
,
20
,
21
,
22
,
23
,
24
,
25
,
26
,
27
,
28
,
29
,
30
,
31
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input2x2x4_B2_CPU
)
{
RunSpaceToDepth
<
DeviceType
::
CPU
>
(
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
TEST_F
(
SpaceToDepthOpTest
,
Input4x4x1_B2_OPENCL
)
{
RunSpaceToDepth
<
DeviceType
::
GPU
>
(
{
1
,
2
,
2
,
4
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
},
2
,
{
1
,
1
,
1
,
16
},
{
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
,
16
});
}
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
RandomTest
(
const
int
block_size
,
const
std
::
vector
<
index_t
>
&
shape
)
{
testing
::
internal
::
LogToStderr
();
srand
(
time
(
NULL
));
// Construct graph
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
shape
);
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"Input"
,
NHWC
,
"InputNCHW"
,
NCHW
);
OpDefBuilder
(
"SpaceToDepth"
,
"SpaceToDepthTest"
)
.
Input
(
"InputNCHW"
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
Output
(
"OutputNCHW"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
();
net
.
TransformDataFormat
<
DeviceType
::
CPU
,
float
>
(
"OutputNCHW"
,
NCHW
,
"Output"
,
NHWC
);
BufferToImage
<
D
,
T
>
(
&
net
,
"Input"
,
"InputImg"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
OpDefBuilder
(
"SpaceToDepth"
,
"SpaceToDepthTest"
)
.
Input
(
"InputImg"
)
.
AddIntArg
(
"block_size"
,
block_size
)
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Output
(
"OutputImg"
)
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
ImageToBuffer
<
D
,
float
>
(
&
net
,
"OutputImg"
,
"OPENCLOutput"
,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT
)
{
ExpectTensorNear
<
float
>
(
*
net
.
GetTensor
(
"Output"
),
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-5
);
}
else
{
ExpectTensorNear
<
float
>
(
*
net
.
GetTensor
(
"Output"
),
*
net
.
GetOutput
(
"OPENCLOutput"
),
1e-3
,
1e-4
);
}
}
}
// namespace
TEST_F
(
SpaceToDepthOpTest
,
OPENCLRandomFloat
)
{
RandomTest
<
DeviceType
::
GPU
,
float
>
(
2
,
{
1
,
384
,
384
,
32
});
}
TEST_F
(
SpaceToDepthOpTest
,
OPENCLRandomHalf
)
{
RandomTest
<
DeviceType
::
GPU
,
half
>
(
2
,
{
1
,
384
,
384
,
32
});
}
}
// namespace test
}
// namespace ops
}
// namespace mace
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录