magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 1eee3d69
Authored on Jul 30, 2020 by wilfChen
Parent commit: 12a150bb

gpu layernorm
Showing 4 changed files with 140 additions and 60 deletions (+140, -60)
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/layer_norm_grad_impl.cu   +42  -50
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/layer_norm_impl.cu        +4   -10
tests/st/ops/gpu/test_layer_norm_grad_op.py                                     +52  -0
tests/st/ops/gpu/test_layer_norm_op.py                                          +42  -0
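The substantive kernel-side change in both .cu files below is the dynamic shared-memory sizing: instead of deriving the buffer size from the reduced dimension (col_dim or row_dim), the launchers now size it from the launch configuration alone, since each warp of the block leaves exactly one partial result per reduced quantity. A minimal host-side sketch of that sizing rule, assuming WARP_SIZE is 32 and the 256-thread block used by these launchers (the helper name ReduceShareMemSize is illustrative, not part of the commit):

#include <cstdio>

constexpr int WARP_SIZE = 32;

// Bytes of dynamic shared memory needed when every warp in the block keeps
// `values_per_warp` partial results (e.g. mean/var/num = 3, dgamma/dbeta = 2).
template <typename T>
size_t ReduceShareMemSize(int thread_per_block, int values_per_warp) {
  return thread_per_block / WARP_SIZE * values_per_warp * sizeof(T);
}

int main() {
  const int thread_per_block = 256;  // block size used by the launchers in this commit
  printf("LayerNormKernel / InputPropKernel: %zu bytes\n", ReduceShareMemSize<float>(thread_per_block, 3));
  printf("GammaAndBetaPropKernel:            %zu bytes\n", ReduceShareMemSize<float>(thread_per_block, 2));
  return 0;
}

With a 256-thread block this gives 8 partials per reduced value, i.e. 96 bytes for the 3-value float reduction and 64 bytes for the 2-value one, independent of the tensor shape.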
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/layer_norm_grad_impl.cu

@@ -34,9 +34,9 @@ inline __device__ half my_pow(half a, double b) {
 }

 template <typename T>
 inline __device__ void GammaAndBetaThreadReduce(const int& col, const int& row_dim, const int& col_dim,
                                                 const T& epsilon, const T* dy, const T* x, const T* mean,
                                                 const T* var, T* dg, T* db) {
   int loop_num = (row_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE;
   for (int i = threadIdx.x; i < loop_num; i += blockDim.x) {
     for (int j = 0; j < NUM_PER_THREAD_REDUCE; j++) {

@@ -53,7 +53,7 @@ inline __device__ void GammaAndBetaThreadReduce(const int& col, const int& row_d
 }

 template <typename T>
 inline __device__ void GammaAndBetaWarpReduce(T* dg, T* db) {
   for (int delta = (WARP_SIZE >> 1); delta > 0; delta >>= 1) {
     dg[0] += __shfl_down_sync(0xffffffff, dg[0], delta);
     db[0] += __shfl_down_sync(0xffffffff, db[0], delta);

@@ -61,12 +61,8 @@ inline __device__ void GammaAndBetaWarpReduce(T* dg, T* db) {
 }

 template <typename T>
 inline __device__ void GammaAndBetaBlockReduce(const int& col, const int& row_dim, T* dg, T* db, T* dg_addr,
                                                T* db_addr) {
-  if (threadIdx.x >= row_dim) {
-    return;
-  }
   // load data to share memory
   // thread(0, 32, 64, 96, ...) keep the data
   DynamicSharedMem<T> share_mem;

@@ -93,8 +89,8 @@ inline __device__ void GammaAndBetaBlockReduce(const int& col, const int& row_di
 }

 template <typename T>
 __global__ void GammaAndBetaPropKernel(const int row_dim, const int col_dim, const T epsilon, const T* dy,
                                        const T* x, const T* mean_addr, const T* var_addr, T* dg_addr, T* db_addr) {
   // row: [0:param_axis]
   // col: [param_axis:]
   // dg[i][j] = dy[i][j] * (var[i] + epsilon, -0.5) * (x[i][j] - mean[i])

@@ -109,9 +105,9 @@ __global__ void GammaAndBetaPropKernel(const int row_dim, const int col_dim, con
 }

 template <typename T>
 inline __device__ void InputThreadReduce(const int& row, const int& col_dim, const int& param_dim, const T& epsilon,
                                          T* sum1, T* sum2, T* sum3, const T* dy, const T* x, const T* mean,
                                          const T* var, const T* gamma) {
   int loop_num = (col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE;
   for (int i = threadIdx.x; i < loop_num; i += blockDim.x) {
     for (int j = 0; j < NUM_PER_THREAD_REDUCE; j++) {

@@ -133,9 +129,9 @@ inline __device__ void InputThreadReduce(const int& row, const int& col_dim, con
 }

 template <>
 inline __device__ void InputThreadReduce(const int& row, const int& col_dim, const int& param_dim,
                                          const half& epsilon, half* sum1, half* sum2, half* sum3, const half* dy,
                                          const half* x, const half* mean, const half* var, const half* gamma) {
   int loop_num = (col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE;
   for (int i = threadIdx.x; i < loop_num; i += blockDim.x) {
     for (int j = 0; j < NUM_PER_THREAD_REDUCE; j++) {

@@ -157,7 +153,7 @@ inline __device__ void InputThreadReduce(const int& row, const int& col_dim, con
 }

 template <typename T>
 inline __device__ void InputWarpReduce(T* sum1, T* sum2, T* sum3) {
   for (int delta = (WARP_SIZE >> 1); delta > 0; delta >>= 1) {
     sum1[0] += __shfl_down_sync(0xffffffff, sum1[0], delta);
     sum2[0] += __shfl_down_sync(0xffffffff, sum2[0], delta);

@@ -166,11 +162,7 @@ inline __device__ void InputWarpReduce(T* sum1, T* sum2, T* sum3) {
 }

 template <typename T>
 inline __device__ void InputBlockReduce(const int& col_dim, T* sum1, T* sum2, T* sum3, T* share_mem) {
-  if (threadIdx.x >= col_dim) {
-    return;
-  }
   // load data to share memory
   // thread(0, 32, 64, 96, ...) keep the data
   if (threadIdx.x % WARP_SIZE == 0) {

@@ -193,9 +185,9 @@ inline __device__ void InputBlockReduce(const int& col_dim, T* sum1, T* sum2, T*
 }

 template <typename T>
 inline __device__ void InputProp(const int& row, const int& col_dim, const int& param_dim, const T& epsilon,
                                  const T* dy, const T* x, const T* mean, const T* var, const T* gamma, T* dx,
                                  const T* share_mem) {
   for (int col = threadIdx.x; col < col_dim; col += blockDim.x) {
     int pos = (row * col_dim + col);
     int gamma_offset = pos % param_dim;

@@ -208,9 +200,9 @@ inline __device__ void InputProp(const int& row, const int& col_dim, const int&
 }

 template <>
 inline __device__ void InputProp(const int& row, const int& col_dim, const int& param_dim, const half& epsilon,
                                  const half* dy, const half* x, const half* mean, const half* var, const half* gamma,
                                  half* dx, const half* share_mem) {
   for (int col = threadIdx.x; col < col_dim; col += blockDim.x) {
     int pos = (row * col_dim + col);
     int gamma_offset = pos % param_dim;

@@ -218,14 +210,14 @@ inline __device__ void InputProp(const int& row, const int& col_dim, const int&
     half v2 = x[pos] - mean[row];
     half v3 = my_pow(var[row] + epsilon, -0.5);
     dx[pos] = v1 * v3 + share_mem[0] * __float2half(2.0 / col_dim) * v2 +
-              (__float2half(-1.0) * v3 * share_mem[1] + __float2half(1.0 / col_dim) * share_mem[0] * share_mem[2]) \
-              * __float2half(1.0 / col_dim);
+              (__float2half(-1.0) * v3 * share_mem[1] + __float2half(1.0 / col_dim) * share_mem[0] * share_mem[2])
+              * __float2half(1.0 / col_dim);
   }
 }

 template <typename T>
 __global__ void InputPropKernel(const int row_dim, const int col_dim, const int param_dim, const T epsilon,
                                 const T* dy, const T* x, const T* mean, const T* var, const T* gamma, T* dx) {
   for (int row = blockIdx.x; row < row_dim; row += gridDim.x) {
     T sum1 = 0;
     T sum2 = 0;

@@ -239,21 +231,21 @@ __global__ void InputPropKernel(const int row_dim, const int col_dim, const int
 }

 template <typename T>
 void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* dy,
                    const T* x, const T* mean, const T* var, const T* gamma, T* dx, T* dg, T* db, cudaStream_t stream) {
-  int share_mem_size =
-    ((col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE + WARP_SIZE - 1) / WARP_SIZE * 3 * sizeof(T);
-  InputPropKernel<<<row_dim, 256, share_mem_size, stream>>>(row_dim, col_dim, param_dim, epsilon, dy, x, mean, var,
-                                                            gamma, dx);
-  share_mem_size =
-    ((row_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE + WARP_SIZE - 1) / WARP_SIZE * 2 * sizeof(T);
-  GammaAndBetaPropKernel<<<col_dim, 256, share_mem_size, stream>>>(row_dim, col_dim, epsilon, dy, x, mean, var, dg, db);
+  const int thread_per_block = 256;
+  int share_mem_size = thread_per_block / WARP_SIZE * 3 * sizeof(T);
+  InputPropKernel<<<row_dim, thread_per_block, share_mem_size, stream>>>(row_dim, col_dim, param_dim, epsilon, dy, x,
+                                                                         mean, var, gamma, dx);
+  share_mem_size = thread_per_block / WARP_SIZE * 2 * sizeof(T);
+  GammaAndBetaPropKernel<<<col_dim, thread_per_block, share_mem_size, stream>>>(row_dim, col_dim, epsilon, dy, x, mean,
+                                                                                var, dg, db);
 }

 template void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const float& epsilon,
                             const float* dy, const float* x, const float* mean, const float* var, const float* gamma,
                             float* dx, float* dg, float* db, cudaStream_t stream);
 template void LayerNormGrad(const int& row_dim, const int& col_dim, const int& param_dim, const half& epsilon,
                             const half* dy, const half* x, const half* mean, const half* var, const half* gamma,
                             half* dx, half* dg, half* db, cudaStream_t stream);
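The warp-level helpers above (GammaAndBetaWarpReduce, InputWarpReduce) use the standard __shfl_down_sync pattern: halving the lane offset each step leaves the warp's sum in lane 0, which the block-level reduce then writes to shared memory. A self-contained sketch of that pattern, with an illustrative kernel name and a single warp of ones (not code from the commit):

#include <cstdio>
#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;

// One warp sums 32 values; after the loop lane 0 holds the total,
// mirroring the shuffle loop in GammaAndBetaWarpReduce / InputWarpReduce.
__global__ void WarpSum(const float *in, float *out) {
  float v = in[threadIdx.x];
  for (int delta = (WARP_SIZE >> 1); delta > 0; delta >>= 1) {
    v += __shfl_down_sync(0xffffffff, v, delta);
  }
  if (threadIdx.x == 0) {
    *out = v;
  }
}

int main() {
  float h_in[WARP_SIZE];
  for (int i = 0; i < WARP_SIZE; ++i) h_in[i] = 1.0f;  // expected sum: 32
  float *d_in, *d_out, h_out = 0.0f;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  WarpSum<<<1, WARP_SIZE>>>(d_in, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp sum = %f\n", h_out);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}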
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/layer_norm_impl.cu

@@ -73,10 +73,6 @@ inline __device__ void WarpReduce(T *mean, T *var, T *num) {

 template <typename T>
 inline __device__ void BlockReduce(const int& col_dim, T* mean, T* var, T* num, T* mean_addr, T* var_addr,
                                    T* share_mem) {
-  if (threadIdx.x >= col_dim) {
-    return;
-  }
   // load data to share memory
   // thread(0, 32, 64, 96, ...) keep the data
   if (threadIdx.x % WARP_SIZE == 0) {

@@ -146,13 +142,11 @@ __global__ void LayerNormKernel(const int row_dim, const int col_dim, const int
 template <typename T>
 void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const T& epsilon, const T* x,
                const T* gamma, const T* beta, T* y, T* mean, T* var, cudaStream_t stream) {
-  const dim3 block(row_dim);
-  const dim3 thread(256);
+  const int thread_per_block = 256;
   // keep the mean/var/num after warp reduce
-  int share_mem_size =
-    ((col_dim + NUM_PER_THREAD_REDUCE - 1) / NUM_PER_THREAD_REDUCE + WARP_SIZE - 1) / WARP_SIZE * 3 * sizeof(T);
-  LayerNormKernel<<<block, thread, share_mem_size, stream>>>(row_dim, col_dim, param_dim, epsilon, x, gamma, beta, y,
-                                                             mean, var);
+  int share_mem_size = thread_per_block / WARP_SIZE * 3 * sizeof(T);
+  LayerNormKernel<<<row_dim, thread_per_block, share_mem_size, stream>>>(row_dim, col_dim, param_dim, epsilon, x,
+                                                                         gamma, beta, y, mean, var);
 }

 template void LayerNorm(const int& row_dim, const int& col_dim, const int& param_dim, const float& epsilon, ...
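Both .cu files also drop early-return guards such as `if (threadIdx.x >= col_dim) { return; }` from the block-level reduces. If a block reduction later synchronizes with __syncthreads(), as these shared-memory reductions typically do, every thread of the block must reach the barrier, so surplus threads should contribute a neutral value instead of returning early. A minimal sketch of that safe pattern under those assumptions (kernel name and sizes are illustrative, not the commit's code):

#include <cstdio>
#include <cuda_runtime.h>

// Block-level sum where threads beyond `n` contribute 0 instead of returning
// early, so every thread still reaches __syncthreads().
__global__ void BlockSum(const float *in, float *out, int n) {
  extern __shared__ float buf[];
  float v = (threadIdx.x < n) ? in[threadIdx.x] : 0.0f;  // no early return
  buf[threadIdx.x] = v;
  __syncthreads();
  // Tree reduction in shared memory; blockDim.x must be a power of two here.
  for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
    if (threadIdx.x < stride) {
      buf[threadIdx.x] += buf[threadIdx.x + stride];
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *out = buf[0];
  }
}

int main() {
  const int n = 100, threads = 128;
  float h_in[n], h_out = 0.0f;
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;  // expected sum: 100
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  BlockSum<<<1, threads, threads * sizeof(float)>>>(d_in, d_out, n);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("block sum = %f\n", h_out);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}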
tests/st/ops/gpu/test_layer_norm_grad_op.py

@@ -141,3 +141,55 @@ def test_layernormgrad2():
     assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-6, atol=1e-6)
     assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-6, atol=1e-3)
     assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-6, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_layernormgrad3():
+    begin_norm_axis = -1
+    begin_params_axis = -1
+    x_np = np.random.randn(32, 64).astype(np.float32)
+    dy_np = np.random.randn(32, 64).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-6, atol=1e-6)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-6, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-6, atol=1e-3)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_layernormgrad4():
+    begin_norm_axis = -1
+    begin_params_axis = -1
+    x_np = np.random.randn(32, 64).astype(np.float32)
+    dy_np = np.random.randn(32, 64).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    epsilon = 10e-12
+    dx_np, dg_np, db_np, mean_np, var_np = LayerNormGradReference(x_np, dy_np, gamma_np, epsilon, begin_norm_axis,
+                                                                  begin_params_axis)
+    dy_ms = Tensor(dy_np)
+    x_ms = Tensor(x_np)
+    var_ms = Tensor(var_np)
+    mean_ms = Tensor(mean_np)
+    gamma_ms = Tensor(gamma_np)
+
+    net = LayerNormGradNet(begin_norm_axis, begin_params_axis)
+    dx_ms, dg_ms, db_ms = net(x_ms, dy_ms, var_ms, mean_ms, gamma_ms)
+
+    assert np.allclose(dx_ms.asnumpy(), dx_np, rtol=1e-6, atol=1e-6)
+    assert np.allclose(dg_ms.asnumpy(), dg_np, rtol=1e-6, atol=1e-3)
+    assert np.allclose(db_ms.asnumpy(), db_np, rtol=1e-6, atol=1e-3)
tests/st/ops/gpu/test_layer_norm_op.py

@@ -133,3 +133,45 @@ def test_layernorm3d_2():
     assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-6)
     assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-6)
     assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-6)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_layernorm2d_2():
+    begin_norm_axis = -1
+    begin_params_axis = 1
+    x_np = np.random.randn(64, 32).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-6)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-6)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-6)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_layernorm2d_3():
+    begin_norm_axis = -1
+    begin_params_axis = 1
+    x_np = np.random.randn(128, 128).astype(np.float32)
+    gamma_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    beta_np = np.random.randn(*x_np.shape[begin_params_axis:]).astype(np.float32)
+    y_np, mean_np, var_np = LayerNormReference(begin_norm_axis, begin_params_axis, x_np, gamma_np, beta_np)
+
+    x_ms = Tensor(x_np)
+    gamma_ms = Tensor(gamma_np)
+    beta_ms = Tensor(beta_np)
+    net = LayerNormNet(begin_norm_axis, begin_params_axis)
+    y_ms, mean_ms, var_ms = net(x_ms, gamma_ms, beta_ms)
+    assert np.allclose(y_ms.asnumpy(), y_np, rtol=1e-6, atol=1e-6)
+    assert np.allclose(mean_ms.asnumpy(), mean_np, rtol=1e-6, atol=1e-6)
+    assert np.allclose(var_ms.asnumpy(), var_np, rtol=1e-6, atol=1e-6)