Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
09570b48
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
09570b48
编写于
1月 30, 2018
作者:
C
chengduoZH
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
layer norm -> scale + bias
上级
7e0d21de
变更
2
显示空白变更内容
内联
并排
Showing
2 changed files
with
25 additions
and
21 deletions
+25
-21
paddle/operators/layer_norm_op.cc
paddle/operators/layer_norm_op.cc
+10
-9
python/paddle/v2/fluid/tests/test_layer_norm_op.py
python/paddle/v2/fluid/tests/test_layer_norm_op.py
+15
-12
未找到文件。
paddle/operators/layer_norm_op.cc
浏览文件 @
09570b48
...
...
@@ -45,11 +45,12 @@ class LayerNormOp : public framework::OperatorWithKernel {
auto
matrix_dim
=
framework
::
flatten_to_2d
(
x_dim
,
begin_norm_axis
);
int
left
=
static_cast
<
int
>
(
matrix_dim
[
0
]);
int
right
=
static_cast
<
int
>
(
matrix_dim
[
1
]);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
)[
0
],
lef
t
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
)[
0
],
righ
t
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
)[
0
],
lef
t
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
)[
0
],
righ
t
);
ctx
->
SetOutputDim
(
"Y"
,
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
"Mean"
,
{
left
});
...
...
@@ -143,10 +144,10 @@ class LayerNormKernel<platform::CPUDeviceContext, T>
// TODO(zcd): Some thinking about output_map, is it appropriate that
// `output_map` and `input_map` point to the same memory.
auto
inv_std
_scale
=
var_map
.
unaryExpr
(
inv_std_func
);
auto
inv_std
=
var_map
.
unaryExpr
(
inv_std_func
);
output_map
=
(
input_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
inv_std
_scale
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
-
.
cwiseProduct
(
inv_std
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
+
bias_map
.
replicate
(
left
,
1
);
}
};
...
...
@@ -230,7 +231,7 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
if
(
d_bias
)
{
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
d_bias_map
=
EigenMatrixMapRowMajor
<
T
>
(
d_bias
->
data
<
T
>
(),
1
,
right
);
d_bias_map
=
d_y_map
.
colwise
().
mean
();
d_bias_map
=
d_y_map
.
colwise
().
sum
();
}
if
(
d_scale
)
{
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
@@ -245,7 +246,7 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
var_map
.
unaryExpr
(
inv_std_func
).
replicate
(
1
,
right
))
.
cwiseProduct
(
d_y_map
))
.
colwise
()
.
mean
();
.
sum
();
}
if
(
d_x
)
{
...
...
@@ -269,14 +270,14 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
.
replicate
(
1
,
right
);
// dy_var_dx
auto
dvar_end_part
=
(
x_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
.
cwiseProduct
(
d_y_map
)
.
rowwise
()
.
sum
();
auto
dvar_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
unaryExpr
(
triple_product_func
)
.
cwiseProduct
(
dvar_end_part
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
));
.
replicate
(
1
,
right
);
auto
dx_var
=
(
T
(
-
1.0
)
/
right
)
*
(
x_map
-
mean_map
.
replicate
(
1
,
right
)).
cwiseProduct
(
dvar_end
);
...
...
python/paddle/v2/fluid/tests/test_layer_norm_op.py
浏览文件 @
09570b48
...
...
@@ -49,35 +49,38 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
def
_reference_layer_norm_grad
(
x
,
grad_y
,
scale
,
mean
,
var
,
begin_norm_axis
=
1
):
x_shape
=
x
.
shape
scale_shape
=
scale
.
shape
N
=
reduce
(
mul
,
x_shape
[
0
:
begin_norm_axis
],
1
)
D
=
reduce
(
mul
,
x_shape
[
begin_norm_axis
:
len
(
x_shape
)],
1
)
grad_y
.
shape
=
[
N
,
D
]
x
.
shape
=
[
N
,
D
]
mean
.
shape
=
[
N
,
1
]
var
.
shape
=
[
N
,
1
]
scale
.
shape
=
[
1
,
D
]
d_scale
=
np
.
sum
(
grad_y
,
axis
=
1
).
reshape
([
1
,
D
])
d_bias
=
scale
.
reshape
([
1
,
D
])
*
np
.
sum
((
(
x
-
mean
)
*
np
.
sqrt
(
1
/
var
))
*
grad_y
,
axis
=
1
).
reshape
([
1
,
D
])
d_bias
=
np
.
sum
(
grad_y
,
axis
=
0
).
reshape
([
1
,
D
])
d_scale
=
np
.
sum
(((
x
-
mean
)
*
np
.
sqrt
(
1
/
var
))
*
grad_y
,
axis
=
0
).
reshape
([
1
,
D
])
dx_end
=
np
.
sqrt
(
1.0
/
var
)
*
grad_y
dx_end
=
scale
*
np
.
sqrt
(
1.0
/
var
)
*
grad_y
d_mean_0
=
np
.
sum
(
-
np
.
sqrt
(
1.0
/
var
)
*
grad_y
,
axis
=
1
).
reshape
([
N
,
1
])
d_mean_0
=
np
.
sum
(
-
np
.
sqrt
(
1.0
/
var
)
*
grad_y
*
scale
,
axis
=
1
).
reshape
(
[
N
,
1
])
# d_mean_1 = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape(
# [N, 1]) * (-1.0 / D * np.sqrt(1.0 / var) *
# np.sum(x - mean, axis=1).reshape([N, 1])).reshape([N, 1])
d_mean
=
1.0
/
D
*
(
d_mean_0
)
d_mean
=
1.0
/
D
*
d_mean_0
d_std
=
np
.
sum
(
-
1.0
/
var
*
(
x
-
mean
)
*
grad_y
,
axis
=
1
).
reshape
([
N
,
1
])
*
(
d_std
=
np
.
sum
(
-
1.0
/
var
*
(
x
-
mean
)
*
grad_y
*
scale
,
axis
=
1
).
reshape
([
N
,
1
])
*
(
1.0
/
D
*
np
.
sqrt
(
1.0
/
var
).
reshape
([
N
,
1
])
*
(
x
-
mean
))
grad_x
=
scale
.
reshape
([
1
,
D
])
*
(
dx_end
+
d_mean
+
d_std
)
grad_x
=
dx_end
+
d_mean
+
d_std
grad_y
.
shape
=
x_shape
x
.
shape
=
x_shape
return
grad_x
,
d_
bias
,
d_scale
scale
.
shape
=
scale_shape
return
grad_x
,
d_
scale
,
d_bias
def
create_or_get_tensor
(
scope
,
var_name
,
var
,
place
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录