MegEngine 天元 / MegEngine
Commit 1fa3449a
Authored Feb 01, 2023 by Megvii Engine Team

feat(opr): add general normalization cuda naive implementation

GitOrigin-RevId: e42f3c2df8317d83da62454e2cedcfd29a460fc6
Parent: 5b7e6a80
Showing 11 changed files with 213 additions and 673 deletions (+213 −673):
dnn/scripts/opr_param_defs.py                        +0   −2
dnn/src/cuda/general_norm/general_norm_cuda.cu       +92  −605
dnn/src/cuda/general_norm/general_norm_cuda.cuh      +6   −5
dnn/src/cuda/general_norm/opr_impl.cpp               +28  −32
dnn/src/cuda/general_norm/opr_impl.h                 +1   −0
dnn/src/naive/general_norm/opr_impl.cpp              +2   −2
dnn/test/cuda/general_norm.cpp                       +78  −7
imperative/python/megengine/functional/nn.py         +1   −12
imperative/python/megengine/module/normalization.py  +5   −5
src/opr/impl/dnn/general_norm.cpp                    +0   −1
src/opr/test/dnn/general_norm.cpp                    +0   −2
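For orientation before the per-file diffs: viewed through the (A, B, C) split used below, general normalization computes a mean and rstd for every (a, c) slice along the normalized axis, then applies an optional per-axis affine transform. A NumPy reference sketch of those semantics, as implied by the naive implementation and tests in this commit (general_norm_ref is an illustrative name, not MegEngine API):

import numpy as np

def general_norm_ref(x, axis, weight=None, bias=None, eps=1e-5):
    # Per-slice statistics over `axis` only.
    mean = x.mean(axis=axis, keepdims=True)
    rstd = 1.0 / np.sqrt(x.var(axis=axis, keepdims=True) + eps)
    y = (x - mean) * rstd
    if weight is not None:
        # Affine scale/shift: one weight/bias entry per element of `axis`.
        shape = [1] * x.ndim
        shape[axis] = x.shape[axis]
        y = y * weight.reshape(shape) + bias.reshape(shape)
    return y

x = np.random.randn(10, 30).astype("float32")
y = general_norm_ref(x, axis=0, weight=np.ones(10, "float32"),
                     bias=np.zeros(10, "float32"))
print(y.shape)  # (10, 30); mean/rstd each hold 30 values (one per column)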
dnn/scripts/opr_param_defs.py

@@ -1272,8 +1272,6 @@ PADDING_MODES = [Doc('REPLICATE = 0', 'aaaaaa|abcdefgh|hhhhhhh'),
 (pdef('GeneralNorm')
  .add_fields('bool', 'affine', 'true')
  .add_fields('float32', 'eps', '1e-5f')
- .add_fields('uint64', 'normalized_dim', '1')
- .add_fields('uint64', 'normalized_size', '1')
  .add_fields('uint64', 'normalized_axis', '0')
 )
dnn/src/cuda/general_norm/general_norm_cuda.cu

(This diff is collapsed in the web view: +92 −605.)
dnn/src/cuda/general_norm/general_norm_cuda.cuh

@@ -7,14 +7,15 @@ namespace general_norm {
 template <typename T, typename T_ACC>
 void forward(
-        T* X, T* gamma, T* beta, int64_t M, int64_t N, T_ACC eps, T* Y,
-        T_ACC* mean, T_ACC* rstd, cudaStream_t stream);
+        T* X_data, T* weight_data, T* bias_data, T* Y_data, T_ACC* mean_data,
+        T_ACC* rstd_data, T_ACC eps, int64_t A, int64_t B, int64_t C,
+        cudaStream_t stream);

 template <typename T, typename T_ACC>
 void backward(
-        const T* dY_data, const T* X_data, const T_ACC* mean_data,
-        const T_ACC* rstd_data, const T* gamma_data, int64_t M, int64_t N,
-        T* dX_data, T* dgamma_data, T* dbeta_data, cudaStream_t stream);
+        const T* dY_data, const T* X_data, const T* gamma_data,
+        const T_ACC* mean_data, const T_ACC* rstd_data, T* dX_data,
+        T* dgamma_data, T* dbeta_data, int64_t A, int64_t B, int64_t C,
+        cudaStream_t stream);

 }  // namespace general_norm
 }  // namespace cuda
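These signatures replace the flattened (M, N) view with an (A, B, C) split around the normalized axis; the split itself comes from megdnn::reduce::get_ABC in opr_impl.cpp below. A minimal Python sketch of what such a split computes, assuming the usual outer/axis/inner reduce decomposition (get_abc is a hypothetical mirror, not the MegEngine source):

def get_abc(shape, axis):
    # Split `shape` into (outer, normalized, inner) extents around `axis`.
    A = 1
    for s in shape[:axis]:
        A *= s
    B = shape[axis]          # the extent being normalized over
    C = 1
    for s in shape[axis + 1:]:
        C *= s
    return A, B, C

# Consistent with the CUDA tests below: data {n_slices, slice_len} with
# normalized_axis=0 gives A=1, B=n_slices, C=slice_len, so mean/rstd hold
# A*C = slice_len values.
print(get_abc((10, 30), 0))  # (1, 10, 30)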
dnn/src/cuda/general_norm/opr_impl.cpp

@@ -16,12 +16,9 @@ void GeneralNormForwardImpl::exec(
     auto p = param();
     float eps = p.eps;
     bool affine = p.affine;
-    uint64_t slice_length = p.normalized_size;
-    uint64_t slice_dim = p.normalized_dim;
-    uint64_t n_slices = 1;
-    for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) {
-        n_slices = n_slices * data.layout.shape[i];
-    }
+    uint64_t axis = p.normalized_axis;
+    uint64_t A, B, C;
+    megdnn::reduce::get_ABC(data.layout, A, B, C, axis);
     auto stream = cuda_stream(handle());
     using namespace ::megdnn::cuda::general_norm;

@@ -32,9 +29,9 @@ void GeneralNormForwardImpl::exec(
         using T_ACC = float;                                                   \
         forward<T, T_ACC>(                                                     \
             data.ptr<T>(), affine ? weight.ptr<T>() : nullptr,                 \
-            affine ? bias.ptr<T>() : nullptr, static_cast<int64_t>(n_slices),  \
-            static_cast<int64_t>(slice_length), static_cast<T_ACC>(eps),       \
-            dst.ptr<T>(), mean.ptr<T_ACC>(), rstd.ptr<T_ACC>(), stream);       \
+            affine ? bias.ptr<T>() : nullptr, dst.ptr<T>(), mean.ptr<T_ACC>(), \
+            rstd.ptr<T_ACC>(), static_cast<T_ACC>(eps), A, B,                  \
+            C, stream);                                                        \
         return;                                                                \
     }
     MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
@@ -52,12 +49,9 @@ void GeneralNormBackwardImpl::exec(
             ddata.layout, dweight.layout, dbias.layout, workspace.size);
     auto p = param();
     bool affine = p.affine;
-    uint64_t slice_length = p.normalized_size;
-    uint64_t slice_dim = p.normalized_dim;
-    uint64_t n_slices = 1;
-    for (size_t i = 0; i < data.layout.ndim - slice_dim; ++i) {
-        n_slices = n_slices * data.layout.shape[i];
-    }
+    uint64_t axis = p.normalized_axis;
+    uint64_t A, B, C;
+    megdnn::reduce::get_ABC(data.layout, A, B, C, axis);
     auto stream = cuda_stream(handle());
     using namespace ::megdnn::cuda::general_norm;

@@ -66,10 +60,12 @@ void GeneralNormBackwardImpl::exec(
         using T = typename DTypeTrait<DType>::ctype;                            \
         using T_ACC = float;                                                    \
         backward<T, T_ACC>(                                                     \
-            diff.ptr<T>(), data.ptr<T>(), mean.ptr<T_ACC>(), rstd.ptr<T_ACC>(), \
-            affine ? weight.ptr<T>() : nullptr, n_slices, slice_length,         \
-            ddata.ptr<T>(), affine ? dweight.ptr<T>() : nullptr,                \
-            affine ? dbias.ptr<T>() : nullptr, stream);                         \
+            diff.ptr<T>(), data.ptr<T>(), affine ? weight.ptr<T>() : nullptr,   \
+            mean.ptr<T_ACC>(), rstd.ptr<T_ACC>(),                               \
+            ddata.ptr<T>(),                                                     \
+            affine ? dweight.ptr<T>() : nullptr,                                \
+            affine ? dbias.ptr<T>() : nullptr, A, B, C,                         \
+            stream);                                                            \
         return;                                                                 \
     }
     MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
dnn/src/cuda/general_norm/opr_impl.h

 #pragma once
 #include "megdnn/oprs.h"
+#include "src/common/reduce_helper.h"
 #include "src/cuda/cudnn_wrapper.h"
dnn/src/naive/general_norm/opr_impl.cpp

@@ -16,7 +16,7 @@ void forward(
         _megdnn_tensor_in data, _megdnn_tensor_in weight, _megdnn_tensor_in bias,
         _megdnn_tensor_out dst, _megdnn_tensor_out mean, _megdnn_tensor_out rstd,
         const Param& param) {
-    printf("general forward\n");
+    printf("Cpu general forward\n");
     float eps = param.eps;
     bool affine = param.affine;
     uint64_t axis = param.normalized_axis;
@@ -105,7 +105,7 @@ void backward(
             btmp = (db * mean.ptr<T_ACC>()[a * C + c] - ds) * atmp * atmp * atmp / B;
             ctmp = -btmp * mean.ptr<T_ACC>()[a * C + c] - db * atmp / B;
-            for (uint64_t b = 0; b < B; b++) {
+            for (size_t b = 0; b < B; b++) {
                 auto weight_v = affine ? weight.ptr<T>()[b] : static_cast<T>(1.0f);
                 ddata.ptr<T>()[a * B * C + b * C + c] =
                         diff.ptr<T>()[a * B * C + b * C + c] * atmp * weight_v +
...
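The btmp/ctmp terms above are the usual single-axis normalization backward coefficients. Writing r for rstd (atmp in the code), mu for mean, and assuming from the surrounding code that ds = sum_b dy_b gamma_b x_b and db = sum_b dy_b gamma_b over the normalized extent B, the visible lines implement:

\[
dx_b = dy_b\,\gamma_b\,r + \mathrm{btmp}\,x_b + \mathrm{ctmp},
\qquad
\mathrm{btmp} = \frac{(\mathrm{db}\,\mu - \mathrm{ds})\,r^3}{B},
\qquad
\mathrm{ctmp} = -\mathrm{btmp}\,\mu - \frac{\mathrm{db}\,r}{B}.
\]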
dnn/test/cuda/general_norm.cpp

 #include "test/cuda/fixture.h"

 #include "test/common/checker.h"
+#include "test/cuda/benchmark.h"

 namespace megdnn {
 namespace test {

-TEST_F(CUDA, GeneralNorm_FORWARD) {
+TEST_F(CUDA, GENERALNORM_FORWARD) {
     using Param = GeneralNormForward::Param;
     Param param;
     param.affine = true;
     param.eps = 1e-6;
-    param.normalized_dim = 1;
     Checker<GeneralNormForward> checker(handle_cuda());
     checker.set_epsilon(1e-2);
     auto run = [&](DType d) {
         for (size_t n_slices : {10, 30})
             for (size_t slice_len : {10, 30}) {
-                param.normalized_size = slice_len;
                 param.normalized_axis = 0;
                 checker.set_param(param)
                         .set_dtype(0, d)
                         .set_dtype(1, d)
                         .set_dtype(2, d)
                         .set_dtype(3, d)
                         .set_dtype(4, dtype::Float32())
                         .set_dtype(5, dtype::Float32())
                         .execs({{n_slices, slice_len},
                                 {n_slices},
                                 {n_slices},
                                 {n_slices, slice_len},
                                 {slice_len},
                                 {slice_len}});
                 param.normalized_axis = 1;
                 checker.set_param(param).set_dtype(0, d).set_dtype(1, d)
...
@@ -39,19 +53,76 @@ TEST_F(CUDA, GeneralNorm_FORWARD) {
     run(dtype::BFloat16());
 }

-TEST_F(CUDA, GeneralNorm_BACKWARD) {
+TEST_F(CUDA, GENERALNORM_SPEED_FP32) {
+    using Param = GeneralNormForward::Param;
+    auto benchmarker = Benchmarker<GeneralNormForward>(handle_cuda());
+    benchmarker.set_dtype(0, dtype::Float32());
+    benchmarker.set_dtype(1, dtype::Float32());
+    Param param;
+    param.affine = true;
+    float eachTime;
+    float totalTime = 0.f;
+#define ITER 10
+    param.normalized_axis = 0;
+    for (auto i = 0; i < ITER; i++) {
+        eachTime = benchmarker.set_param(param).exec(
+                {{100, 2000}, {100}, {100}, {}, {}, {}});
+        totalTime += eachTime;
+    }
+    totalTime /= ITER;
+    printf("PGENERALNORM_SPEED_FP32 AVG TIME: %.6fms\n", totalTime);
+
+    totalTime = 0.f;
+    param.normalized_axis = 1;
+    for (auto i = 0; i < ITER; i++) {
+        eachTime = benchmarker.set_param(param).exec(
+                {{2000, 100}, {100}, {100}, {}, {}, {}});
+        totalTime += eachTime;
+    }
+    totalTime /= ITER;
+    printf("PGENERALNORM_SPEED_FP32 AVG TIME: %.6fms\n", totalTime);
+#undef ITER
+}
+TEST_F(CUDA, GENERALNORM_BACKWARD) {
     using Param = GeneralNormBackward::Param;
     Param param;
     param.affine = true;
     param.eps = 1e-6;
-    param.normalized_dim = 1;
     Checker<GeneralNormBackward> checker(handle_cuda());
     checker.set_epsilon(1e-1);
     auto run = [&](DType d) {
         for (size_t n_slices : {10, 30})
             for (size_t slice_len : {10, 30}) {
-                param.normalized_size = slice_len;
                 param.normalized_axis = 0;
                 checker.set_param(param)
                         .set_dtype(0, d)
                         .set_dtype(1, d)
                         .set_dtype(2, d)
                         .set_dtype(3, dtype::Float32())
                         .set_dtype(4, dtype::Float32())
                         .set_dtype(5, d)
                         .set_dtype(6, d)
                         .set_dtype(7, d)
                         .execs({{n_slices, slice_len},
                                 {n_slices, slice_len},
                                 {n_slices},
                                 {slice_len},
                                 {slice_len},
                                 {n_slices, slice_len},
                                 {n_slices},
                                 {n_slices}});
                 param.normalized_axis = 1;
                 checker.set_param(param).set_dtype(0, d).set_dtype(1, d)
...
imperative/python/megengine/functional/nn.py

@@ -1136,7 +1136,6 @@ def layer_norm(
 def general_norm(
     inp: Tensor,
-    normalized_shape: tuple,
     normalized_axis: int,
     affine: bool,
     weight: Optional[Tensor] = None,

@@ -1158,21 +1157,11 @@ def general_norm(
         See :math:`\beta` in :class:`~.GeneralNorm`.
         eps: a value added to the denominator for numerical stability. Default: 1e-5
     """
-    if isinstance(normalized_shape, int):
-        normalized_shape = [normalized_shape]
-    normalized_dim = len(normalized_shape)
-    assert normalized_dim > 0
-    normalized_size = 1
-    for i in range(normalized_dim):
-        normalized_size = normalized_size * normalized_shape[i]
     assert normalized_axis >= 0 and normalized_axis < inp.ndim
-    op = builtin.GeneralNorm(
-        affine=affine, eps=eps, normalized_dim=normalized_dim,
-        normalized_size=normalized_size, normalized_axis=normalized_axis,
-    )
     if affine:
...
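With normalized_shape dropped from the signature, callers now pass only the axis. A minimal usage sketch of the updated entry point, assuming the positional argument order used by GeneralNorm.forward below (shapes illustrative):

import numpy as np
import megengine as mge
import megengine.functional as F

x = mge.tensor(np.random.randn(2, 3, 4, 4).astype("float32"))
# Normalize over axis 1: weight/bias carry one entry per element of that axis.
w = mge.tensor(np.ones(3, dtype="float32"))
b = mge.tensor(np.zeros(3, dtype="float32"))
y = F.nn.general_norm(x, 1, True, w, b, 1e-5)
print(y.shape)  # (2, 3, 4, 4)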
imperative/python/megengine/module/normalization.py

@@ -231,7 +231,7 @@ class GeneralNorm(Module):
     (2, 3, 4, 4)
     """

-    def __init__(self, normalized_shape, normalized_axis, eps=1e-05, affine=True, **kwargs):
+    def __init__(self, inp_shape, normalized_axis, eps=1e-05, affine=True, **kwargs):
         super().__init__(**kwargs)
         if isinstance(normalized_shape, int):
             normalized_shape = (normalized_shape,)

@@ -241,9 +241,9 @@ class GeneralNorm(Module):
         self.affine = affine
         if self.affine:
             self.weight = Parameter(
-                np.ones(self.normalized_shape, dtype="float32"))
+                np.ones(inp_shape[normalized_axis], dtype="float32"))
             self.bias = Parameter(
-                np.zeros(self.normalized_shape, dtype="float32"))
+                np.zeros(inp_shape[normalized_axis], dtype="float32"))
         else:
             self.weight = None
             self.bias = None

@@ -257,10 +257,10 @@ class GeneralNorm(Module):
     def forward(self, x):
         x = F.nn.general_norm(
-            x, self.normalized_shape, self.normalized_axis, self.affine,
-            self.weight, self.bias, self.eps
+            x, self.normalized_axis, self.affine, self.weight, self.bias, self.eps
         )
         return x

     def _module_info_string(self) -> str:
-        s = "normalized_shape={normalized_shape}, normalized_axis={normalized_axis}, eps={eps}, affine={affine}"
+        s = "normalized_axis={normalized_axis}, eps={eps}, affine={affine}"
         return s.format(**self.__dict__)
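Correspondingly, the module is now constructed from the full input shape plus the normalized axis. A short sketch against the constructor shown above (values illustrative):

import numpy as np
import megengine as mge
from megengine.module.normalization import GeneralNorm

m = GeneralNorm((2, 3, 4, 4), 1)  # inp_shape, normalized_axis
x = mge.tensor(np.random.randn(2, 3, 4, 4).astype("float32"))
print(m(x).shape)  # (2, 3, 4, 4), matching the class docstring example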
src/opr/impl/dnn/general_norm.cpp

@@ -66,7 +66,6 @@ SymbolVarArray GeneralNormForward::make(
 void GeneralNormForward::get_output_var_shape(
         const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
-    uint64_t normalized_dim = param().normalized_dim;
     out_shape[0] = inp_shape[0];
     TensorShape unnormalized_shape = inp_shape[0];
     unnormalized_shape.ndim -= 1;
...
src/opr/test/dnn/general_norm.cpp

@@ -23,8 +23,6 @@ void run_forward(bool is_affine, size_t normalized_size, size_t normalized_axis)
     Param param;
     param.eps = 1e-5;
     param.affine = is_affine;
-    param.normalized_dim = 1;
-    param.normalized_size = normalized_size;
     param.normalized_axis = normalized_axis;
     auto make_graph = [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
...