Mr.Vain / Mace (forked from Xiaomi / Mace)
Commit 6b65f42b
Authored May 12, 2020 by luxuhui

feature: support GroupNorm op

Signed-off-by: Luxuhui <luxuhui@xiaomi.com>

Parent: b5dfcf4d

17 changed files with 761 additions and 133 deletions (+761, −133)
mace/core/ops/operator.h                    +1    −1
mace/libmace/mace.cc                        +1    −1
mace/ops/argmax.cc                          +1    −1
mace/ops/eltwise.cc                         +1    −1
mace/ops/group_norm.cc                      +195  −0
mace/ops/mvnorm.cc                          +4    −1
mace/ops/opencl/cl/mvnorm.cl                +72   −23
mace/ops/opencl/image/mvnorm.cc             +142  −77
mace/ops/opencl/image/mvnorm.h              +49   −18
mace/ops/opencl/mvnorm.h                    +0    −1
mace/ops/registry/ops_registry.cc           +2    −0
mace/utils/tuner.h                          +1    −1
test/ccunit/mace/ops/group_norm_test.cc     +262  −0
third_party/caffe/caffe.proto               +2    −2
tools/python/transform/base_converter.py    +8    −6
tools/python/transform/caffe_converter.py   +19   −0
tools/python/transform/shape_inference.py   +1    −0
mace/core/ops/operator.h

@@ -65,7 +65,7 @@ class Operation {
   }
 
   bool ExistArg(const std::string &name) const {
-    MACE_CHECK(operator_def_, "operator_def was null!");
+    MACE_CHECK(operator_def_, "operator_def is null!");
     return ProtoArgHelper::ExistArg<OperatorDef>(*operator_def_, name);
   }
mace/libmace/mace.cc

@@ -893,7 +893,7 @@ MaceStatus MaceEngine::Impl::TransposeOutput(
     int64_t output_size = std::accumulate(shape.begin(), shape.end(), 1,
                                           std::multiplies<int64_t>());
     MACE_CHECK(output_size <= output->second.impl_->buffer_size)
-        << "Output size exceeds buffer size: shape"
+        << output_tensor->name() << " Output size exceeds buffer size: shape"
         << MakeString<int64_t>(shape) << " vs buffer size "
         << output->second.impl_->buffer_size;
     output->second.impl_->shape = shape;
mace/ops/argmax.cc

@@ -176,7 +176,7 @@ class ArgMaxOp : public Operation {
   const FrameworkType model_type_;
   // for Caffe
   const bool has_axis_;
-  const bool top_k_;
+  const int top_k_;
   const bool out_val_;
   // for ONNX and TENSORFLOW
mace/ops/eltwise.cc

@@ -953,7 +953,7 @@ class EltwiseOp : public Operation {
       swapped = !swapped;
     }
-    // convert tensor for caffe's boardcast
+    // convert tensor for caffe's broadcast
     if (!has_data_format_ && input0->dim_size() == 4) {
       if (input1->dim_size() == 2) {
         const_cast<Tensor *>(input1)->Reshape(
mace/ops/group_norm.cc (new file, mode 100644)

// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include <vector>

#include "mace/core/ops/operator.h"
#include "mace/core/registry/ops_registry.h"
#include "mace/ops/activation.h"
#include "mace/ops/delegator/activation.h"
#include "mace/utils/memory.h"

#ifdef MACE_ENABLE_OPENCL
#include "mace/ops/opencl/image/mvnorm.h"
#endif  // MACE_ENABLE_OPENCL

namespace mace {
namespace ops {

template<DeviceType D, class T>
class GroupNormOp;

template<class T>
class GroupNormOp<DeviceType::CPU, T> : public Operation {
 public:
  explicit GroupNormOp(OpConstructContext *context)
      : Operation(context),
        eps_(Operation::GetOptionalArg<float>(
            "epsilon", static_cast<float>(1e-5))),
        group_num_(Operation::GetOptionalArg<int>("group_num", 32)) {}

  MaceStatus Run(OpContext *context) override {
    MACE_UNUSED(context);
    const Tensor *input = Input(INPUT);
    Tensor *output = Output(OUTPUT);
    MACE_CHECK(input->dim_size() == 4,
               "input must be 4-dimensional. ", input->dim_size());
    const std::vector<index_t> &input_shape = input->shape();
    MACE_RETURN_IF_ERROR(output->Resize(input_shape));

    const auto batch = input_shape[0];
    const auto channel = input_shape[1];
    const auto height = input_shape[2];
    const auto width = input_shape[3];
    MACE_CHECK(channel % group_num_ == 0, "group_num_ invalid.",
               channel, group_num_);
    const auto group_size = channel / group_num_;

    Tensor::MappingGuard guard_input(input);
    Tensor::MappingGuard guard_output(output);
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>();

    const auto outer_loop = batch * group_num_;
    const auto inner_loop = group_size * height * width;
    utils::ThreadPool &thread_pool =
        context->device()->cpu_runtime()->thread_pool();

    auto *scratch_buffer = context->device()->scratch_buffer();
    scratch_buffer->Rewind();
    auto scratch_buffer_size = outer_loop * sizeof(float) * 2;
    MACE_RETURN_IF_ERROR(scratch_buffer->GrowSize(scratch_buffer_size));
    float *mean_ptr = scratch_buffer->mutable_data<float>();
    float *variance_ptr = mean_ptr + outer_loop;

    // compute EX
    thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
      for (index_t i = start; i < end; i += step) {
        const auto offset = inner_loop * i;
        mean_ptr[i] = std::accumulate(input_data + offset,
                                      input_data + offset + inner_loop,
                                      static_cast<T>(0.0f));
        mean_ptr[i] /= inner_loop;
      }
    }, 0, outer_loop, 1);

    // compute (X - EX)^2
    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                              index_t start1, index_t end1, index_t step1) {
      for (index_t i = start0; i < end0; i += step0) {
        const auto offset = i * inner_loop;
        for (index_t j = start1; j < end1; j += step1) {
          const auto idx = offset + j;
          const auto x_ex = input_data[idx] - mean_ptr[i];
          output_data[idx] = x_ex * x_ex;
        }
      }
    }, 0, outer_loop, 1, 0, inner_loop, 1);

    // compute (E((X - EX)^2) + eps_)^0.5
    thread_pool.Compute1D([=](index_t start, index_t end, index_t step) {
      for (index_t i = start; i < end; i += step) {
        auto output_data_base = output_data + inner_loop * i;
        variance_ptr[i] = std::accumulate(output_data_base,
                                          output_data_base + inner_loop,
                                          static_cast<T>(0.0f));
        variance_ptr[i] =
            std::pow(variance_ptr[i] / inner_loop + eps_, 0.5f);
      }
    }, 0, outer_loop, 1);

    // compute (X - EX) / ((E((X - EX)^2) + eps_)^0.5)
    thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
                              index_t start1, index_t end1, index_t step1) {
      for (index_t i = start0; i < end0; i += step0) {
        const auto offset = i * inner_loop;
        for (index_t j = start1; j < end1; j += step1) {
          output_data[offset + j] =
              (input_data[offset + j] - mean_ptr[i]) / variance_ptr[i];
        }
      }
    }, 0, outer_loop, 1, 0, inner_loop, 1);

    return MaceStatus::MACE_SUCCESS;
  }

 private:
  const float eps_;
  const int group_num_;

  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};

#ifdef MACE_ENABLE_OPENCL
template<>
class GroupNormOp<DeviceType::GPU, float> : public Operation {
 public:
  explicit GroupNormOp(OpConstructContext *context) : Operation(context) {
    const auto group_num = Operation::GetOptionalArg<int>("group_num", 32);
    const auto eps = Operation::GetOptionalArg<float>(
        "epsilon", static_cast<float>(1e-5));
    if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
      kernel_ = make_unique<opencl::image::MVNormKernel>(
          true, opencl::image::MeanType::GROUP_CHANNELS, eps, group_num);
    } else {
      MACE_NOT_IMPLEMENTED;
    }
  }

  MaceStatus Run(OpContext *context) override {
    const Tensor *input = this->Input(INPUT);
    Tensor *output = this->Output(OUTPUT);
    MACE_CHECK(input->dim_size() == 4,
               "input must be 4-dimensional.", input->dim_size());
    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
    return kernel_->Compute(context, input, output);
  }

 private:
  std::unique_ptr<OpenCLMVNormKernel> kernel_;

  MACE_OP_INPUT_TAGS(INPUT);
  MACE_OP_OUTPUT_TAGS(OUTPUT);
};
#endif  // MACE_ENABLE_OPENCL

void RegisterGroupNorm(OpRegistry *op_registry) {
  MACE_REGISTER_OP(op_registry, "GroupNorm", GroupNormOp,
                   DeviceType::CPU, float);
  MACE_REGISTER_BF16_OP(op_registry, "GroupNorm", GroupNormOp,
                        DeviceType::CPU);
  MACE_REGISTER_GPU_OP(op_registry, "GroupNorm", GroupNormOp);
  MACE_REGISTER_OP_CONDITION(
      op_registry,
      OpConditionBuilder("GroupNorm").SetDevicePlacerFunc(
          [](OpConditionContext *context) -> std::set<DeviceType> {
            auto op = context->operator_def();
            if (op->output_shape_size() != op->output_size()) {
              return {DeviceType::CPU, DeviceType::GPU};
            }
            const int group_num =
                ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
                    *op, "group_num", 32);
            auto output_channels = op->output_shape(0).dims()[3];
            const int group_size = output_channels / group_num;
            if (group_size % 4 == 0) {
              return {DeviceType::CPU, DeviceType::GPU};
            }
            return {DeviceType::CPU};
          }));
}

}  // namespace ops
}  // namespace mace
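As a sanity check on the CPU path above, here is a minimal NumPy sketch of the same computation (a hypothetical helper, not part of this commit); it mirrors the mean / squared-deviation / normalize passes, with epsilon added inside the square root exactly as in the kernel:

import numpy as np

def group_norm_nchw(x, group_num, eps=1e-5):
    """Reference group normalization over an NCHW float array."""
    n, c, h, w = x.shape
    # Mirrors MACE_CHECK(channel % group_num_ == 0, ...) above.
    assert c % group_num == 0
    # One row per (batch, group) slice of group_size * H * W elements.
    g = x.reshape(n * group_num, (c // group_num) * h * w)
    mean = g.mean(axis=1, keepdims=True)                 # EX
    var = ((g - mean) ** 2).mean(axis=1, keepdims=True)  # E((X - EX)^2)
    return ((g - mean) / np.sqrt(var + eps)).reshape(n, c, h, w)

Note also the device placer above: when channels / group_num is not a multiple of 4, the op is pinned to the CPU, matching the 4-channel image packing the OpenCL kernels assume.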
mace/ops/mvnorm.cc

@@ -145,9 +145,12 @@ class MVNormOp<DeviceType::GPU, float> : public Operation {
         Operation::GetOptionalArg<bool>("across_channels", false);
     auto eps = Operation::GetOptionalArg<float>("epsilon", 1e-9);
+    auto mean_type = across_channels ?
+                     opencl::image::MeanType::ACROSS_CHANNELS :
+                     opencl::image::MeanType::SINGLE_CHANNEL;
     if (context->GetOpMemoryType() == MemoryType::GPU_IMAGE) {
       kernel_ = make_unique<opencl::image::MVNormKernel>(
-          normalize_variance, across_channels, eps);
+          normalize_variance, mean_type, eps);
     } else {
       MACE_NOT_IMPLEMENTED;
     }
mace/ops/opencl/cl/mvnorm.cl

 #include <common.h>
 
-DATA_TYPE4 compute_mean_image(image2d_t input, const int width_idx,
-                              const int hb_idx, const int chan_blks,
-                              const int height, const int width) {
+DATA_TYPE4 compute_mean_image(image2d_t input, const int height,
+                              const int width, const int chan_blks,
+                              const int group_blks, const int batch_idx,
+                              const int chan_blk_idx) {
   DATA_TYPE4 total = 0.0f;
   DATA_TYPE4 mean = 0.0f;
-  const int hb_base = mul24(hb_idx / height, height);
-  const int wc_blks = mul24(width, chan_blks);
+  const int hb_base = mul24(batch_idx, height);
 #ifdef ACROSS_CHANNELS
+  const int wc_blks = mul24(width, chan_blks);
   for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) {
     for (int pos = 0; pos < wc_blks; ++pos) {
       DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, h_idx));
@@ -16,7 +17,23 @@ DATA_TYPE4 compute_mean_image(image2d_t input, const int width_idx,
     }
   }
   DATA_TYPE total_value = total.x + total.y + total.z + total.w;
   DATA_TYPE mean_value =
       total_value / (DATA_TYPE)(mul24(mul24(height, wc_blks), 4));
   mean = (DATA_TYPE4){mean_value, mean_value, mean_value, mean_value};
 #else
+#ifdef GROUP_CHANNELS
+  const int group_base = chan_blk_idx / group_blks * group_blks;
+  const int wg_blks_start = mul24(width, group_base);
+  const int wg_blks_end = wg_blks_start + group_blks * width;
+  for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) {
+    for (int pos = wg_blks_start; pos < wg_blks_end; ++pos) {
+      DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, h_idx));
+      total += in_data;
+    }
+  }
+  DATA_TYPE total_value = total.x + total.y + total.z + total.w;
+  const int total_num = mul24(mul24(height, wg_blks_end - wg_blks_start), 4);
+  DATA_TYPE mean_value = total_value / (DATA_TYPE)(total_num);
+  mean = (DATA_TYPE4){mean_value, mean_value, mean_value, mean_value};
+#else
   for (int h_idx = hb_base; h_idx < hb_base + height; ++h_idx) {
@@ -27,15 +44,42 @@ DATA_TYPE4 compute_mean_image(image2d_t input, const int width_idx,
     }
   }
   mean = total / mul24(height, width);
-#endif
+#endif  // GROUP_CHANNELS
+#endif  // ACROSS_CHANNELS
   return mean;
 }
 
+__kernel void mvnorm_compute_mean_value(OUT_OF_RANGE_PARAMS
+                                        GLOBAL_WORK_GROUP_SIZE_DIM2
+                                        __read_only image2d_t input,
+                                        __private const int height,
+                                        __private const int width,
+                                        __private const int group_blks,
+                                        __write_only image2d_t output) {
+  const int chan_blk_idx = get_global_id(0);
+  const int batch_idx = get_global_id(1);
+#ifndef NON_UNIFORM_WORK_GROUP
+  if (chan_blk_idx >= global_size_dim0 || batch_idx >= global_size_dim1) {
+    return;
+  }
+#endif
+  const int chan_blks = global_size_dim0;
+  const int batch = global_size_dim1;
+
+  DATA_TYPE4 mean = compute_mean_image(input, height, width, chan_blks,
+                                       group_blks, batch_idx, chan_blk_idx);
+  WRITE_IMAGET(output, (int2)(chan_blk_idx, batch_idx), mean);
+}
+
 __kernel void mvnorm_mean(OUT_OF_RANGE_PARAMS
                           GLOBAL_WORK_GROUP_SIZE_DIM3
                           __read_only image2d_t input,
+                          __read_only image2d_t mean_image,  // E(X)
                           __private const int height,
+                          __private const int group_blks,
                           __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
@@ -54,18 +98,20 @@ __kernel void mvnorm_mean(OUT_OF_RANGE_PARAMS
   const int pos = mad24(chan_blk_idx, width, width_idx);
   DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
-  DATA_TYPE4 mean = compute_mean_image(input, width_idx,
-                                       hb_idx, chan_blks, height, width);
+  DATA_TYPE4 mean = READ_IMAGET(mean_image, SAMPLER,
+                                (int2)(chan_blk_idx, hb_idx / height));
   in_data -= mean;
   WRITE_IMAGET(output, (int2)(pos, hb_idx), in_data);
 }
 
 // compute the (X - EX)^2
 __kernel void mvnorm_vn_step1(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
-                              __write_only image2d_t mean_image,  // E(X)
+                              __read_only image2d_t mean_image,  // E(X)
                               __write_only image2d_t square_image,  // (X - EX)^2
-                              __private const int height) {
+                              __private const int height,
+                              __private const int group_blks) {
   const int chan_blk_idx = get_global_id(0);
   const int width_idx = get_global_id(1);
   const int hb_idx = get_global_id(2);
@@ -82,23 +128,21 @@ __kernel void mvnorm_vn_step1(OUT_OF_RANGE_PARAMS
   const int pos = mad24(chan_blk_idx, width, width_idx);
   DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
-  DATA_TYPE4 mean = compute_mean_image(input, width_idx,
-                                       hb_idx, chan_blks, height, width);
+  DATA_TYPE4 mean = READ_IMAGET(mean_image, SAMPLER,
+                                (int2)(chan_blk_idx, hb_idx / height));
   in_data = in_data - mean;
-  DATA_TYPE4 pow_data = in_data * in_data;
-  if (hb_idx == 0 && width_idx == 0) {
-    WRITE_IMAGET(mean_image, (int2)(chan_blk_idx, 0), mean);
-  }
+  DATA_TYPE4 pow_data = in_data * in_data;
   WRITE_IMAGET(square_image, (int2)(pos, hb_idx), pow_data);
 }
 
 // compute (X - EX) / (E((X - EX)^2)^0.5 + eps_)
 __kernel void mvnorm_vn_step2(OUT_OF_RANGE_PARAMS
                               GLOBAL_WORK_GROUP_SIZE_DIM3
                               __read_only image2d_t input,
                               __read_only image2d_t mean_image,  // E(X)
-                              __read_only image2d_t square_image,  // (X - EX)^2
+                              __read_only image2d_t mean_image_sqr,  // E((X - EX)^2)
                               __private const int height,
+                              __private const int group_blks,
                               __private const float eps,
                               __write_only image2d_t output) {
   const int chan_blk_idx = get_global_id(0);
@@ -115,15 +159,20 @@ __kernel void mvnorm_vn_step2(OUT_OF_RANGE_PARAMS
   const int chan_blks = global_size_dim0;
   const int width = global_size_dim1;
 
-  DATA_TYPE4 mean = READ_IMAGET(mean_image, SAMPLER, (int2)(chan_blk_idx, 0));
+  DATA_TYPE4 mean = READ_IMAGET(mean_image, SAMPLER,
+                                (int2)(chan_blk_idx, hb_idx / height));
   const int pos = mad24(chan_blk_idx, width, width_idx);
   DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(pos, hb_idx));
   in_data = in_data - mean;
-  DATA_TYPE4 mean_v = compute_mean_image(square_image, width_idx,
-                                         hb_idx, chan_blks, height, width);
-  DATA_TYPE4 norm_data = in_data / (sqrt(mean_v) + eps);
+  DATA_TYPE4 mean_sqr = READ_IMAGET(mean_image_sqr, SAMPLER,
+                                    (int2)(chan_blk_idx, hb_idx / height));
+#ifdef GROUP_CHANNELS
+  DATA_TYPE4 norm_data = in_data / sqrt(mean_sqr + eps);
+#else
+  DATA_TYPE4 norm_data = in_data / (sqrt(mean_sqr) + eps);
+#endif
   WRITE_IMAGET(output, (int2)(pos, hb_idx), norm_data);
 }
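One subtlety worth flagging in the step-2 kernel: under GROUP_CHANNELS the epsilon sits inside the square root, norm = (X - EX) / sqrt(E((X - EX)^2) + eps), which matches the standard GroupNorm definition and the CPU path in group_norm.cc, while the pre-existing MVN path keeps eps outside, norm = (X - EX) / (sqrt(E((X - EX)^2)) + eps). The #ifdef keeps the two behaviours separate so existing MVNorm models are unaffected.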
mace/ops/opencl/image/mvnorm.cc

@@ -29,101 +29,167 @@ namespace {
 MaceStatus BuildMVNKernel(OpenCLRuntime *runtime, cl::Kernel *kernel,
                           const char *kernel_name,
                           std::set<std::string> *built_options,
-                          bool across_channel) {
-  std::stringstream micro_name;
-  micro_name << "-Dmvnorm=" << kernel_name;
-  built_options->emplace(micro_name.str());
+                          MeanType mean_type_) {
+  std::stringstream macro_name;
+  macro_name << "-Dmvnorm=" << kernel_name;
+  built_options->emplace(macro_name.str());
   built_options->emplace("-DDATA_TYPE=" + DtToCLDt(DT_FLOAT));
   built_options->emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DT_FLOAT));
-  if (across_channel) {
+  if (mean_type_ == MeanType::ACROSS_CHANNELS) {
     built_options->emplace("-DACROSS_CHANNELS");
+  } else if (mean_type_ == MeanType::GROUP_CHANNELS) {
+    built_options->emplace("-DGROUP_CHANNELS");
   }
   MACE_RETURN_IF_ERROR(runtime->BuildKernel("mvnorm", kernel_name,
                                             *built_options, kernel));
   return MaceStatus::MACE_SUCCESS;
 }
-
-std::unique_ptr<Image> CreateImage(
-    OpContext *context, const DataType dt,
-    const std::vector<index_t> &buffer_shape) {
-  std::unique_ptr<Image> image =
-      make_unique<Image>(context->device()->allocator());
-  std::vector<size_t> shape;
-  OpenCLUtil::CalImage2DShape(buffer_shape,
-                              OpenCLBufferType::IN_OUT_CHANNEL, &shape);
-  MACE_CHECK(image->Allocate(shape, dt) == MaceStatus::MACE_SUCCESS);
-  VLOG(1) << "MVNormKernel::CreateImage allocate image_:"
-          << MakeString(shape);
-
-  return image;
-}
 }  // namespace
 
-MVNormKernel::MVNormKernel(bool normalize_variance,
-                           bool across_channels, float eps)
+MVNormKernel::MVNormKernel(bool normalize_variance,
+                           MeanType mean_type, float eps, int group_num)
     : normalize_variance_(normalize_variance),
-      across_channels_(across_channels),
-      eps_(eps) {}
-
-void MVNormKernel::CheckImage(OpContext *context, const DataType dt,
-                              const std::vector<index_t> &square_shape,
-                              const std::vector<index_t> &mean_shape) {
-  if (square_image_ == nullptr) {
-    square_image_ = CreateImage(context, dt, square_shape);
-  }
-  if (mean_image_ == nullptr) {
-    mean_image_ = CreateImage(context, dt, mean_shape);
-  }
-}
+      mean_type_(mean_type),
+      eps_(eps),
+      group_num_(group_num) {}
 
 MaceStatus MVNormKernel::Compute(OpContext *context, const Tensor *input,
                                  Tensor *output) {
   const auto batch = input->dim(0);
   const auto height = input->dim(1);
   const auto width = input->dim(2);
   const auto channels = input->dim(3);
 
+  index_t group_blocks = 0;
+  if (mean_type_ == MeanType::GROUP_CHANNELS) {
+    MACE_CHECK(group_num_ > 0, "group num should > 0");
+    const index_t group = channels / group_num_;
+    MACE_CHECK(group > 0 && group % 4 == 0, group,
+               " can not be divided by 4");
+    group_blocks = group / 4;
+  }
+
+  return DoCompute(context, input, output, batch, height, width,
+                   channels, group_blocks);
+}
+
+MaceStatus MVNormKernel::DoCompute(
+    OpContext *context, const Tensor *input, Tensor *output,
+    const index_t batch, const index_t height, const index_t width,
+    const index_t channels, const index_t group_blocks) {
   const index_t channel_blocks = RoundUpDiv4(channels);
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
   auto runtime = context->device()->gpu_runtime()->opencl_runtime();
 
+  const std::vector<index_t> mean_shape = {batch, 1, 1, channels};
+  std::vector<size_t> mean_image_shape;
+  OpenCLUtil::CalImage2DShape(mean_shape, OpenCLBufferType::IN_OUT_HEIGHT,
+                              &mean_image_shape);
+  ScratchImageManager *scratch_manager =
+      context->device()->gpu_runtime()->scratch_image_manager();
+  ScratchImage scratch_mean_image(scratch_manager);
+  auto mace_mean_image = scratch_mean_image.Scratch(
+      context->device()->allocator(), mean_image_shape, input->dtype());
+  cl::Image *mean_image = static_cast<cl::Image *>(mace_mean_image->buffer());
+
   if (normalize_variance_) {
-    const std::vector<index_t> &square_shape = input->buffer_shape();
-    const std::vector<index_t> mean_shape = {1, 1, 1, channels};
-    CheckImage(context, input->dtype(), square_shape, mean_shape);
-    // compute the (X - EX)^2
+    ScratchImage scratch_mean_image_sqr(scratch_manager);
+    auto mace_mean_image_sqr = scratch_mean_image_sqr.Scratch(
+        context->device()->allocator(), mean_image_shape, input->dtype());
+    cl::Image *mean_image_sqr =
+        static_cast<cl::Image *>(mace_mean_image_sqr->buffer());
+
+    // compute the EX
+    MACE_RETURN_IF_ERROR(ExecuteMeanValueKernel(
+        context, runtime, batch, height, width, channel_blocks,
+        group_blocks, input->opencl_image(), mean_image));
+    // compute (X - EX)^2 to output
     MACE_RETURN_IF_ERROR(ExecuteVarianceNormStep1Kernel(
-        context, runtime, gws, input));
+        context, runtime, gws, height, group_blocks,
+        input->opencl_image(), mean_image, output->opencl_image()));
+    // compute E((X - EX)^2) to mean_image_sqr_
+    MACE_RETURN_IF_ERROR(ExecuteMeanValueKernel(
+        context, runtime, batch, height, width, channel_blocks,
+        group_blocks, output->opencl_image(), mean_image_sqr));
+    // compute the compute (X - EX) / (E((X - EX)^2)^0.5 + eps_)
     MACE_RETURN_IF_ERROR(ExecuteVarianceNormStep2Kernel(
-        context, runtime, gws, input, output));
+        context, runtime, gws, height, group_blocks,
+        input->opencl_image(), mean_image, mean_image_sqr,
+        output->opencl_image()));
   } else {
+    // compute the EX
+    MACE_RETURN_IF_ERROR(ExecuteMeanValueKernel(
+        context, runtime, batch, height, width, channel_blocks,
+        group_blocks, input->opencl_image(), mean_image));
+    // compute the (X - EX)
    MACE_RETURN_IF_ERROR(ExecuteMeanNormKernel(
-        context, runtime, gws, input, output));
+        context, runtime, gws, height, group_blocks,
+        input->opencl_image(), mean_image, output->opencl_image()));
   }
 
   return MaceStatus::MACE_SUCCESS;
 }
 
+MaceStatus MVNormKernel::ExecuteMeanValueKernel(
+    OpContext *context, OpenCLRuntime *runtime,
+    const index_t batch, const index_t height, const index_t width,
+    const index_t channel_blocks, const index_t group_blocks,
+    const cl::Image *input_image, cl::Image *output_image) {
+  MACE_OUT_OF_RANGE_DEFINITION;
+  if (kernel_mean_.get() == nullptr) {
+    std::set<std::string> built_options;
+    MACE_OUT_OF_RANGE_CONFIG;
+    MACE_NON_UNIFORM_WG_CONFIG;
+    MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_mean_,
+                                        "mvnorm_compute_mean_value",
+                                        &built_options, mean_type_));
+    kwg_size_mean_ = static_cast<uint32_t>(
+        runtime->GetKernelMaxWorkGroupSize(kernel_mean_));
+  }
+
+  const uint32_t gws[2] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(batch)};
+  const std::vector<uint32_t> lws =
+      {static_cast<uint32_t>(kwg_size_mean_) / 8, 8, 0};
+  MACE_OUT_OF_RANGE_INIT(kernel_mean_);
+  uint32_t idx = 0;
+  MACE_OUT_OF_RANGE_SET_ARGS(kernel_mean_);
+  MACE_SET_2D_GWS_ARGS(kernel_mean_, gws);
+  kernel_mean_.setArg(idx++, *input_image);
+  kernel_mean_.setArg(idx++, static_cast<int>(height));
+  kernel_mean_.setArg(idx++, static_cast<int>(width));
+  kernel_mean_.setArg(idx++, static_cast<int>(group_blocks));
+  kernel_mean_.setArg(idx++, *output_image);
+
+  std::string tuning_key = Concat("mvnorm_compute_mean_opencl_kernel",
+                                  gws[0], gws[1], normalize_variance_,
+                                  mean_type_);
+  MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_mean_,
+                                           tuning_key, gws, lws,
+                                           context->future()));
+  MACE_OUT_OF_RANGE_VALIDATION;
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
 MaceStatus MVNormKernel::ExecuteMeanNormKernel(OpContext *context,
                                                OpenCLRuntime *runtime,
                                                const uint32_t (&gws)[3],
-                                               const Tensor *input,
-                                               Tensor *output) {
-  const auto height = input->dim(1);
+                                               const index_t height,
+                                               const index_t group_blocks,
+                                               const cl::Image *input,
+                                               const cl::Image *mean_image,
+                                               cl::Image *output) {
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_step1_.get() == nullptr) {
     std::set<std::string> built_options;
     MACE_OUT_OF_RANGE_CONFIG;
     MACE_NON_UNIFORM_WG_CONFIG;
     MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_step1_,
                                         "mvnorm_mean",
-                                        &built_options, across_channels_));
+                                        &built_options, mean_type_));
     kwg_size_step1_ = static_cast<uint32_t>(
         runtime->GetKernelMaxWorkGroupSize(kernel_step1_));
   }
@@ -132,14 +198,16 @@ MaceStatus MVNormKernel::ExecuteMeanNormKernel(OpContext *context,
   uint32_t idx = 0;
   MACE_OUT_OF_RANGE_SET_ARGS(kernel_step1_);
   MACE_SET_3D_GWS_ARGS(kernel_step1_, gws);
-  kernel_step1_.setArg(idx++, *(input->opencl_image()));
+  kernel_step1_.setArg(idx++, *input);
+  kernel_step1_.setArg(idx++, *mean_image);
   kernel_step1_.setArg(idx++, static_cast<int>(height));
-  kernel_step1_.setArg(idx++, *(output->opencl_image()));
+  kernel_step1_.setArg(idx++, static_cast<int>(group_blocks));
+  kernel_step1_.setArg(idx++, *output);
 
   std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_step1_);
-  std::string tuning_key = Concat("mvnorm_mean_opencl_kernel", gws[0],
-                                  gws[1], gws[2], normalize_variance_,
-                                  across_channels_);
+  std::string tuning_key = Concat("mvnorm_mean_opencl_kernel", gws[0],
+                                  gws[1], gws[2], normalize_variance_,
+                                  mean_type_, group_blocks);
   MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_step1_, tuning_key,
                                            gws, lws, context->future()));
@@ -147,12 +215,11 @@ MaceStatus MVNormKernel::ExecuteMeanNormKernel(OpContext *context,
   return MaceStatus::MACE_SUCCESS;
 }
 
-// The first step of compute Variance Norm, compute the (X - EX)^2
-// store them into the square_image_
+// compute the (X - EX)^2
 MaceStatus MVNormKernel::ExecuteVarianceNormStep1Kernel(
     OpContext *context, OpenCLRuntime *runtime,
-    const uint32_t (&gws)[3], const Tensor *input) {
-  const auto height = input->dim(1);
+    const uint32_t (&gws)[3], const index_t height,
+    const index_t group_blocks, const cl::Image *input,
+    const cl::Image *mean_image, cl::Image *output) {
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_step1_.get() == nullptr) {
     std::set<std::string> built_options;
@@ -160,7 +227,7 @@ MaceStatus MVNormKernel::ExecuteVarianceNormStep1Kernel(
     MACE_NON_UNIFORM_WG_CONFIG;
     MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_step1_,
                                         "mvnorm_vn_step1",
-                                        &built_options, across_channels_));
+                                        &built_options, mean_type_));
     kwg_size_step1_ = static_cast<uint32_t>(
         runtime->GetKernelMaxWorkGroupSize(kernel_step1_));
   }
@@ -169,18 +236,17 @@ MaceStatus MVNormKernel::ExecuteVarianceNormStep1Kernel(
   uint32_t idx = 0;
   MACE_OUT_OF_RANGE_SET_ARGS(kernel_step1_);
   MACE_SET_3D_GWS_ARGS(kernel_step1_, gws);
-  kernel_step1_.setArg(idx++, *(input->opencl_image()));
-  cl::Image *mean_image = static_cast<cl::Image *>(mean_image_->buffer());
+  kernel_step1_.setArg(idx++, *input);
   kernel_step1_.setArg(idx++, *mean_image);
-  cl::Image *square_image = static_cast<cl::Image *>(square_image_->buffer());
-  kernel_step1_.setArg(idx++, *square_image);
+  kernel_step1_.setArg(idx++, *output);
   kernel_step1_.setArg(idx++, static_cast<int>(height));
+  kernel_step1_.setArg(idx++, static_cast<int>(group_blocks));
 
   std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_step1_);
   std::string tuning_key = Concat("mvnorm_v_step1_opencl_kernel", gws[0],
                                   gws[1], gws[2], normalize_variance_,
-                                  across_channels_);
+                                  mean_type_);
   MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_step1_, tuning_key,
                                            gws, lws, context->future()));
@@ -188,12 +254,12 @@ MaceStatus MVNormKernel::ExecuteVarianceNormStep1Kernel(
   return MaceStatus::MACE_SUCCESS;
 }
 
-// The second step of compute Variance Norm, read the (X - EX)^2 from
-// square_image_ and compute (X - EX) / (E((X - EX)^2)^0.5 + eps_)
+// compute (X - EX) / (E((X - EX)^2)^0.5 + eps_)
 MaceStatus MVNormKernel::ExecuteVarianceNormStep2Kernel(
     OpContext *context, OpenCLRuntime *runtime,
-    const uint32_t (&gws)[3], const Tensor *input, Tensor *output) {
-  const auto height = input->dim(1);
+    const uint32_t (&gws)[3], const index_t height,
+    const index_t group_blocks, const cl::Image *input,
+    const cl::Image *mean_image, const cl::Image *mean_image_sqr,
+    cl::Image *output) {
   MACE_OUT_OF_RANGE_DEFINITION;
   if (kernel_step2_.get() == nullptr) {
     std::set<std::string> built_options;
@@ -201,7 +267,7 @@ MaceStatus MVNormKernel::ExecuteVarianceNormStep2Kernel(
     MACE_NON_UNIFORM_WG_CONFIG;
     MACE_RETURN_IF_ERROR(BuildMVNKernel(runtime, &kernel_step2_,
                                         "mvnorm_vn_step2",
-                                        &built_options, across_channels_));
+                                        &built_options, mean_type_));
     kwg_size_step2_ = static_cast<uint32_t>(
         runtime->GetKernelMaxWorkGroupSize(kernel_step2_));
   }
@@ -210,20 +276,19 @@ MaceStatus MVNormKernel::ExecuteVarianceNormStep2Kernel(
   uint32_t idx = 0;
   MACE_OUT_OF_RANGE_SET_ARGS(kernel_step2_);
   MACE_SET_3D_GWS_ARGS(kernel_step2_, gws);
-  kernel_step2_.setArg(idx++, *(input->opencl_image()));
-  cl::Image *mean_image = static_cast<cl::Image *>(mean_image_->buffer());
+  kernel_step2_.setArg(idx++, *input);
   kernel_step2_.setArg(idx++, *mean_image);
-  cl::Image *square_image = static_cast<cl::Image *>(square_image_->buffer());
-  kernel_step2_.setArg(idx++, *square_image);
+  kernel_step2_.setArg(idx++, *mean_image_sqr);
   kernel_step2_.setArg(idx++, static_cast<int>(height));
+  kernel_step2_.setArg(idx++, static_cast<int>(group_blocks));
   kernel_step2_.setArg(idx++, static_cast<float>(eps_));
-  kernel_step2_.setArg(idx++, *(output->opencl_image()));
+  kernel_step2_.setArg(idx++, *output);
 
   std::vector<uint32_t> lws = Default3DLocalWS(runtime, gws, kwg_size_step2_);
   std::string tuning_key = Concat("mvnorm_v_step2_opencl_kernel", gws[0],
                                   gws[1], gws[2], normalize_variance_,
-                                  across_channels_);
+                                  mean_type_);
   MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_step2_, tuning_key,
                                            gws, lws, context->future()));
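The net effect of this rewrite: previously mvnorm_vn_step1/step2 re-ran compute_mean_image for every output pixel, so each pixel paid a full height * width * channel-block reduction. Now a dedicated 2D kernel (mvnorm_compute_mean_value, one work-item per channel-block and batch) materializes E(X) and E((X - EX)^2) into small scratch images once, and the per-pixel kernels just do two image reads. Schematically, for the normalize_variance_ path (names mirror the kernels above, not a real API):

    mean     = mean_kernel(input)               # E(X)
    squared  = vn_step1(input, mean)            # (X - EX)^2, staged in output
    mean_sqr = mean_kernel(squared)             # E((X - EX)^2)
    output   = vn_step2(input, mean, mean_sqr)  # normalize with eps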
mace/ops/opencl/image/mvnorm.h

@@ -28,48 +28,79 @@ namespace ops {
 namespace opencl {
 namespace image {
 
+enum MeanType {
+  SINGLE_CHANNEL,
+  GROUP_CHANNELS,
+  ACROSS_CHANNELS,
+};
+
 class MVNormKernel : public OpenCLMVNormKernel {
  public:
-  explicit MVNormKernel(bool normalize_variance_, bool across_channels,
-                        float eps);
+  explicit MVNormKernel(bool normalize_variance_, MeanType mean_type,
+                        float eps, int group_num = 0);
   ~MVNormKernel() = default;
 
   MaceStatus Compute(OpContext *context, const Tensor *input,
                      Tensor *output) override;
 
 private:
-  void CheckImage(OpContext *context, const DataType dt,
-                  const std::vector<index_t> &square_shape,
-                  const std::vector<index_t> &mean_shape);
+  MaceStatus DoCompute(OpContext *context, const Tensor *input,
+                       Tensor *output, const index_t batch,
+                       const index_t height, const index_t width,
+                       const index_t channels, const index_t group_blocks);
+  MaceStatus ExecuteMeanValueKernel(OpContext *context,
+                                    OpenCLRuntime *runtime,
+                                    const index_t batch,
+                                    const index_t height,
+                                    const index_t width,
+                                    const index_t channel_blocks,
+                                    const index_t group_blocks,
+                                    const cl::Image *input_image,
+                                    cl::Image *output_image);
   MaceStatus ExecuteMeanNormKernel(OpContext *context,
                                    OpenCLRuntime *runtime,
                                    const uint32_t (&gws)[3],
-                                   const Tensor *input,
-                                   Tensor *output);
+                                   const index_t height,
+                                   const index_t group_blocks,
+                                   const cl::Image *input,
+                                   const cl::Image *mean_image,
+                                   cl::Image *output);
+  // compute the (X - EX)^2
   MaceStatus ExecuteVarianceNormStep1Kernel(OpContext *context,
                                             OpenCLRuntime *runtime,
                                             const uint32_t (&gws)[3],
-                                            const Tensor *input);
+                                            const index_t height,
+                                            const index_t group_blocks,
+                                            const cl::Image *input,
+                                            const cl::Image *mean_image,
+                                            cl::Image *output);
+  // compute (X - EX) / (E((X - EX)^2)^0.5 + eps_)
   MaceStatus ExecuteVarianceNormStep2Kernel(OpContext *context,
                                             OpenCLRuntime *runtime,
                                             const uint32_t (&gws)[3],
-                                            const Tensor *input,
-                                            Tensor *output);
+                                            const index_t height,
+                                            const index_t group_blocks,
+                                            const cl::Image *input,
+                                            const cl::Image *mean_image,
+                                            const cl::Image *mean_image_sqr,
+                                            cl::Image *output);
 
 private:
-  bool normalize_variance_;
-  bool across_channels_;
-  float eps_;
+  const bool normalize_variance_;
+  const MeanType mean_type_;
+  const float eps_;
+  const int group_num_;
 
+  cl::Kernel kernel_mean_;
+  uint32_t kwg_size_mean_;
   cl::Kernel kernel_step1_;
   uint32_t kwg_size_step1_;
   cl::Kernel kernel_step2_;
   uint32_t kwg_size_step2_;
-
-  // the cache of (X - EX)^2
-  std::unique_ptr<Image> square_image_;
-  // the cache of EX
-  std::unique_ptr<Image> mean_image_;
 };
 
 }  // namespace image
mace/ops/opencl/mvnorm.h

@@ -16,7 +16,6 @@
 #define MACE_OPS_OPENCL_MVNORM_H_
 
 #include "mace/public/mace.h"
-#include "mace/utils/math.h"
 
 namespace mace {
mace/ops/registry/ops_registry.cc

@@ -41,6 +41,7 @@ extern void RegisterExtractPooling(OpRegistry *op_registry);
 extern void RegisterFill(OpRegistry *op_registry);
 extern void RegisterFullyConnected(OpRegistry *op_registry);
 extern void RegisterGather(OpRegistry *op_registry);
+extern void RegisterGroupNorm(OpRegistry *op_registry);
 extern void RegisterIdentity(OpRegistry *op_registry);
 extern void RegisterIfDefined(OpRegistry *op_registry);
 extern void RegisterInferConv2dShape(OpRegistry *op_registry);
@@ -120,6 +121,7 @@ void RegisterAllOps(OpRegistry *registry) {
   ops::RegisterFill(registry);
   ops::RegisterFullyConnected(registry);
   ops::RegisterGather(registry);
+  ops::RegisterGroupNorm(registry);
   ops::RegisterIdentity(registry);
   ops::RegisterIfDefined(registry);
   ops::RegisterInferConv2dShape(registry);
mace/utils/tuner.h

@@ -95,7 +95,7 @@ class Tuner {
       std::vector<param_type> opt_param = default_param;
       RetType res = Tune<RetType>(param_generator, func, timer, &opt_param);
       VLOG(3) << "Tuning " << param_key
-              << " retult: " << MakeString(opt_param);
+              << " result: " << MakeString(opt_param);
       param_table_[obfucated_param_key] = opt_param;
       return res;
     } else {
test/ccunit/mace/ops/group_norm_test.cc (new file, mode 100644)

// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/core/types.h"
#include "mace/ops/ops_test_util.h"

namespace mace {
namespace ops {
namespace test {

class GroupNormOpTest : public OpsTestBase {};

namespace {
template <DeviceType D, typename T>
void TestGroupNorm(const std::vector<index_t> &input_shape,
                   const std::vector<T> &input,
                   int group_num,
                   const std::vector<T> &output) {
  OpsTestNet net;
  net.AddInputFromArray<D, T>(MakeString("Input"), input_shape, input);
  if (D == DeviceType::CPU) {
    net.TransformDataFormat<CPU, float>("Input", DataFormat::NHWC,
                                        "InputNCHW", DataFormat::NCHW);
  }

  OpDefBuilder("GroupNorm", "GroupNormTest")
      .Input(D == DeviceType::CPU ? "InputNCHW" : "Input")
      .AddIntArg("group_num", group_num)
      .Output(D == DeviceType::CPU ? "OutputNCHW" : "Output")
      .Finalize(net.NewOperatorDef());

  net.RunOp(D);

  if (D == DeviceType::CPU) {
    net.TransformDataFormat<CPU, float>("OutputNCHW", DataFormat::NCHW,
                                        "Output", DataFormat::NHWC);
  }

  net.AddInputFromArray<D, T>("ExpectedOutput", input_shape, output);
  if (DataTypeToEnum<T>::value == DT_HALF) {
    ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
                        *net.GetOutput("Output"), 1e-2, 1e-2);
  } else {
    ExpectTensorNear<T>(*net.GetOutput("ExpectedOutput"),
                        *net.GetOutput("Output"), 1e-3);
  }
}
}  // namespace

TEST_F(GroupNormOpTest, SimpleTestCPU) {
  TestGroupNorm<DeviceType::CPU, float>(
      {1, 1, 2, 64},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
       3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
       7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
       3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
       7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       1, 2, 3, 4, 5, 6, 7, 8},
      16,
      {-1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.80467, 0.928465, 1.05226, 1.17606,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.17606, -1.05226, -0.928465, -0.80467,
       0.218208, 0.654625, 1.09104, 1.52746});
}

TEST_F(GroupNormOpTest, SimpleTestOpenCL) {
  TestGroupNorm<DeviceType::GPU, float>(
      {1, 1, 2, 64},
      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
       3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
       7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
       3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
       7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       1, 2, 3, 4, 5, 6, 7, 8},
      16,
      {-1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.468507, 0.780844, 1.09318, 1.40552,
       -1.52746, -1.09104, -0.654625, -0.218208,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.80467, 0.928465, 1.05226, 1.17606,
       -1.52746, -1.09104, -0.654625, -0.218208,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.40552, -1.09318, -0.780844, -0.468507,
       0.218208, 0.654625, 1.09104, 1.52746,
       0.218208, 0.654625, 1.09104, 1.52746,
       -1.17606, -1.05226, -0.928465, -0.80467,
       0.218208, 0.654625, 1.09104, 1.52746});
}

TEST_F(GroupNormOpTest, SimpleRandomOPENCL) {
  static unsigned int seed = time(NULL);
  index_t batch = 1 + rand_r(&seed) % 5;
  index_t group = 4 + 4 * (rand_r(&seed) % 3);
  index_t group_num = 2 + rand_r(&seed) % 16;
  index_t channels = group * group_num;
  index_t height = 64;
  index_t width = 64;

  OpsTestNet net;

  // Add input data
  net.AddRandomInput<DeviceType::GPU, float>(
      "Input", {batch, height, width, channels});
  net.TransformDataFormat<DeviceType::CPU, float>(
      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);

  // Construct graph
  OpDefBuilder("GroupNorm", "GroupNormTest")
      .Input("InputNCHW")
      .AddFloatArg("epsilon", 1e-3)
      .Output("OutputNCHW")
      .AddIntArg("group_num", group_num)
      .Finalize(net.NewOperatorDef());

  // run cpu
  net.RunOp();

  net.TransformDataFormat<DeviceType::CPU, float>(
      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);

  // Check
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));

  // Run on opencl
  OpDefBuilder("GroupNorm", "GroupNormTest")
      .Input("Input")
      .AddFloatArg("epsilon", 1e-3)
      .Output("Output")
      .AddIntArg("group_num", group_num)
      .Finalize(net.NewOperatorDef());

  net.Setup(DeviceType::GPU);

  // Tuning
  setenv("MACE_TUNING", "1", 1);
  net.Run();
  unsetenv("MACE_TUNING");

  // Run on opencl
  net.Run();

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5, 1e-4);
}

TEST_F(GroupNormOpTest, SimpleRandomHalfOPENCL) {
  // generate random input
  static unsigned int seed = time(NULL);
  index_t batch = 1 + rand_r(&seed) % 5;
  index_t group = 4 + 4 * (rand_r(&seed) % 16);
  index_t group_num = 2 + rand_r(&seed) % 16;
  index_t channels = group * group_num;
  index_t height = 64;
  index_t width = 64;

  OpsTestNet net;

  // Add input data
  net.AddRandomInput<DeviceType::GPU, float>(
      "Input", {batch, height, width, channels});
  net.TransformDataFormat<DeviceType::CPU, float>(
      "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);

  // Construct graph
  OpDefBuilder("GroupNorm", "GroupNormTest")
      .Input("InputNCHW")
      .AddFloatArg("epsilon", 1e-3)
      .Output("OutputNCHW")
      .AddIntArg("group_num", group_num)
      .Finalize(net.NewOperatorDef());

  // run cpu
  net.RunOp();

  net.TransformDataFormat<DeviceType::CPU, float>(
      "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);

  // Check
  auto expected = net.CreateTensor<float>();
  expected->Copy(*net.GetOutput("Output"));

  // Run on opencl
  OpDefBuilder("GroupNorm", "GroupNormTest")
      .Input("Input")
      .AddFloatArg("epsilon", 1e-3)
      .Output("Output")
      .AddIntArg("group_num", group_num)
      .AddIntArg("T", static_cast<int>(DataType::DT_HALF))
      .Finalize(net.NewOperatorDef());

  net.Setup(DeviceType::GPU);

  // Tuning
  setenv("MACE_TUNING", "1", 1);
  net.Run();
  unsetenv("MACE_TUNING");

  // Run on opencl
  net.Run();

  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-1, 1e-2);
}

}  // namespace test
}  // namespace ops
}  // namespace mace
third_party/caffe/caffe.proto

@@ -1955,7 +1955,7 @@ message GroupNormParameter {
 }
 
 message ResizeNearestParameter {
-  optional float height_scale = 1 [default = 2.0];
-  optional float width_scale = 2 [default = 2.0];
+  optional float height_scale = 1 [default = 2.0];
+  optional float width_scale = 2 [default = 2.0];
 }
tools/python/transform/base_converter.py

@@ -49,7 +49,6 @@ class ActivationType(Enum):
     TANH = 4
     SIGMOID = 5
     LEAKYRELU = 6
-    RELU6 = 7
 
 
 class EltwiseType(Enum):
@@ -101,6 +100,7 @@ MaceSupportedOps = [
     'Concat',
     'Conv2D',
     'Crop',
+    'Cumsum',
     'Deconv2D',
     'DepthToSpace',
     'DepthwiseConv2d',
@@ -112,15 +112,18 @@ MaceSupportedOps = [
     'Fill',
     'FullyConnected',
     'Gather',
+    'GroupNorm',
     'Identity',
     'IfDefined',
     'InferConv2dShape',
     'KaldiBatchNorm',
     'LocalResponseNorm',
+    'LpNorm',
     'LSTMCell',
     'LstmNonlinear',
     'DynamicLSTM',
     'MatMul',
+    'MVNorm',
     'OneHot',
     'Pad',
     'PadContext',
@@ -154,11 +157,8 @@ MaceSupportedOps = [
     'Subsample',
     'SumGroup',
     'TargetRMSNorm',
-    'Transpose',
-    'Cumsum',
     'Tile',
-    'LpNorm',
-    'MVNorm',
+    'Transpose',
 ]
 
 MaceOp = Enum('MaceOp', [(op, op) for op in MaceSupportedOps], type=str)
@@ -178,7 +178,8 @@ MaceFixedDataFormatOps = [MaceOp.BatchNorm,
                           MaceOp.SpaceToBatchND,
                           MaceOp.SpaceToDepth,
                           MaceOp.LpNorm,
-                          MaceOp.MVNorm]
+                          MaceOp.MVNorm,
+                          MaceOp.GroupNorm]
 
 MaceTransposableDataFormatOps = [MaceOp.Activation,
                                  MaceOp.AddN,
@@ -255,6 +256,7 @@ class MaceKeyword(object):
     mace_opencl_mem_type = "opencl_mem_type"
     mace_framework_type_str = "framework_type"
     mace_group_str = "group"
+    mace_group_num_str = "group_num"
     mace_wino_arg_str = "wino_block_size"
     mace_quantize_flag_arg_str = "quantize_flag"
     mace_epsilon_str = 'epsilon'
tools/python/transform/caffe_converter.py

@@ -188,6 +188,7 @@ class CaffeConverter(base_converter.ConverterInterface):
             'InnerProduct': self.convert_fully_connected,
             'Interp': self.convert_interp,
             'BatchNorm': self.convert_folded_batchnorm,
+            'GroupNorm': self.convert_group_norm,
             'Crop': self.convert_crop,
             'Scale': self.convert_scale,
             'ShuffleChannel': self.convert_channel_shuffle,
@@ -555,6 +556,24 @@ class CaffeConverter(base_converter.ConverterInterface):
         op.input.extend([name for name in input_names])
         op.output[:] = scale_op.layer.top[:]
 
+    def convert_group_norm(self, caffe_op):
+        op = self.convert_general_op(caffe_op)
+        op.type = MaceOp.GroupNorm.name
+
+        epsilon_arg = op.arg.add()
+        epsilon_arg.name = MaceKeyword.mace_epsilon_str
+        group_num_arg = op.arg.add()
+        group_num_arg.name = MaceKeyword.mace_group_num_str
+        if hasattr(caffe_op, 'layer') and \
+                hasattr(caffe_op.layer, 'group_norm_param'):
+            param = caffe_op.layer.group_norm_param
+            epsilon_arg.f = param.eps
+            group_num_arg.i = param.group_num
+        else:
+            epsilon_arg.f = 1e-5
+            group_num_arg.i = 32
+
     def convert_pooling(self, caffe_op):
         op = self.convert_general_op(caffe_op)
         param = caffe_op.layer.pooling_param
tools/python/transform/shape_inference.py

@@ -38,6 +38,7 @@ class ShapeInference(object):
             MaceOp.DepthwiseDeconv2d.name: self.infer_shape_deconv,
             MaceOp.Eltwise.name: self.infer_shape_eltwise,
             MaceOp.BatchNorm.name: self.infer_shape_general,
+            MaceOp.GroupNorm.name: self.infer_shape_general,
             MaceOp.AddN.name: self.infer_shape_general,
             MaceOp.Activation.name: self.infer_shape_general,
             MaceOp.Pooling.name: self.infer_shape_conv_pool_shape,