机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 87b5559c
Authored Jan 29, 2018 by chengduoZH

fix scale and bias dim

Parent: 0f47703d

Showing 2 changed files with 52 additions and 48 deletions:
  paddle/operators/layer_norm_op.cc                     +42  -42
  python/paddle/v2/fluid/tests/test_layer_norm_op.py    +10  -6
paddle/operators/layer_norm_op.cc

@@ -38,10 +38,6 @@ class LayerNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Bias"), "");
     PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], 1);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], 1);
     auto x_dim = ctx->GetInputDim("X");
     auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
     PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),

@@ -50,6 +46,11 @@ class LayerNormOp : public framework::OperatorWithKernel {
     auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
     int left = static_cast<int>(matrix_dim[0]);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], left);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], left);
     ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
     ctx->SetOutputDim("Mean", {left});
     ctx->SetOutputDim("Variance", {left});
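The checks above move below the flatten_to_2d call because they now compare Scale and Bias against `left`, the first dimension of the flattened 2-D view. A minimal Python sketch of how that split is assumed to work (the helper name here is illustrative, not Paddle API):

    from functools import reduce
    from operator import mul

    def flatten_to_2d_shape(x_shape, begin_norm_axis):
        # left  = product of the dims before begin_norm_axis
        # right = product of the dims from begin_norm_axis onwards
        left = reduce(mul, x_shape[:begin_norm_axis], 1)
        right = reduce(mul, x_shape[begin_norm_axis:], 1)
        return left, right

    print(flatten_to_2d_shape([2, 3, 4, 5], begin_norm_axis=2))  # (6, 20)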
@@ -64,10 +65,10 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input tensor");
     AddInput("Scale",
-             "Scale is a 1-dimensional tensor of size 1 "
+             "Scale is a 1-dimensional tensor of size H "
              "that is applied to the output");
     AddInput("Bias",
-             "Bias is a 1-dimensional tensor of size 1 "
+             "Bias is a 1-dimensional tensor of size H "
              "that is applied to the output");
     AddOutput("Y", "result after normalization");
     AddOutput("Mean", "Mean of the current mini batch.");
@@ -110,9 +111,6 @@ class LayerNormKernel<platform::CPUDeviceContext, T>
     const auto &x_dims = x->dims();
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-    auto scale_data = scale->data<T>()[0];
-    auto bias_data = bias->data<T>()[0];
     auto *output = ctx.Output<Tensor>("Y");
     auto *mean = ctx.Output<Tensor>("Mean");
     auto *var = ctx.Output<Tensor>("Variance");

@@ -123,7 +121,10 @@ class LayerNormKernel<platform::CPUDeviceContext, T>
     auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
     int left = static_cast<int>(matrix_dim[0]);
     int right = static_cast<int>(matrix_dim[1]);
     auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+    auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), left, 1);
+    auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), left, 1);
     auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
     auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
     auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);

@@ -138,18 +139,15 @@ class LayerNormKernel<platform::CPUDeviceContext, T>
         .mean()
         .unaryExpr(add_epslion);
-    auto scale_inv_std = [scale_data](T ele) {
-      return std::sqrt(1 / ele) * scale_data;
-    };
-    auto sub_bias = [bias_data](T ele) { return bias_data - ele; };
+    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
     // TODO(zcd): Some thinking about output_map, is it appropriate that
     // `output_map` and `input_map` point to the same memory.
-    output_map = (var_map.unaryExpr(scale_inv_std).replicate(1, right))
-                     .cwiseProduct(input_map) +
-                 var_map.unaryExpr(scale_inv_std)
-                     .cwiseProduct(mean_map)
-                     .unaryExpr(sub_bias)
-                     .replicate(1, right);
+    auto inv_std_scale = var_map.unaryExpr(inv_std_func).cwiseProduct(scale_map);
+    output_map =
+        inv_std_scale.replicate(1, right).cwiseProduct(input_map) +
+        (bias_map - inv_std_scale.cwiseProduct(mean_map)).replicate(1, right);
   }
 };
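The rewritten forward pass factors out the product of the inverse standard deviation and the scale once (inv_std_scale) and reuses it in both terms. A hedged NumPy paraphrase of that expression, with shapes following the kernel's 2-D [left, right] view and its [left, 1] scale/bias maps (names are illustrative, not the operator's API):

    import numpy as np

    def layer_norm_forward_2d(x2d, scale_col, bias_col, epsilon=1e-5):
        # x2d: [left, right]; scale_col, bias_col: [left, 1], mirroring scale_map / bias_map.
        mean = x2d.mean(axis=1, keepdims=True)                # [left, 1]
        var = x2d.var(axis=1, keepdims=True) + epsilon        # [left, 1], epsilon folded in as in add_epslion
        inv_std_scale = scale_col / np.sqrt(var)              # var_map.unaryExpr(inv_std_func).cwiseProduct(scale_map)
        y = inv_std_scale * x2d + (bias_col - inv_std_scale * mean)
        # Algebraically the same as scale_col * (x2d - mean) / sqrt(var) + bias_col.
        return y, mean, var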
@@ -165,17 +163,17 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Variance"), "");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
-    const auto x_dims = ctx->GetInputDim("X");
     // check output
     if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
     }
     if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-      ctx->SetOutputDim(framework::GradVarName("Scale"), {1});
+      ctx->SetOutputDim(framework::GradVarName("Scale"), ctx->GetInputDim("Scale"));
     }
     if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"), {1});
+      ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias"));
     }
   }
@@ -210,20 +208,20 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
     const auto *var = ctx.Input<Tensor>("Variance");
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto scale_data = scale->data<T>()[0];
     const auto &x_dims = x->dims();
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
     auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
-    int left = static_cast<int>(matrix_dim[0]),
-        right = static_cast<int>(matrix_dim[1]);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
     // init output
     auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    auto scale_map = ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), left, 1);
     auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
     auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
     auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
@@ -231,36 +229,38 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
     if (d_bias) {
       d_bias->mutable_data<T>(ctx.GetPlace());
-      d_bias->data<T>()[0] = d_y_map.sum();
+      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), left, 1);
+      d_bias_map = d_y_map.colwise().mean();
     }
     if (d_scale) {
       d_scale->mutable_data<T>(ctx.GetPlace());
-      auto inv_std = [](T ele) { return std::sqrt(1 / ele); };
+      auto d_scale_map = EigenMatrixMapRowMajor<T>(d_scale->data<T>(), left, 1);
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
       // There are two equation to compute d_scale. One uses "Y" and the other
       // does not use "Y"
-      d_scale->data<T>()[0] =
-          ((x_map - mean_map.replicate(1, right))
-               .cwiseProduct(var_map.unaryExpr(inv_std).replicate(1, right))
-               .cwiseProduct(d_y_map))
-              .sum();
+      d_scale_map =
+          ((x_map - mean_map.replicate(1, right))
+               .cwiseProduct(var_map.unaryExpr(inv_std_func).replicate(1, right))
+               .cwiseProduct(d_y_map))
+              .colwise()
+              .mean();
     }
     if (d_x) {
       d_x->mutable_data<T>(ctx.GetPlace());
       auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
       auto triple_product_func = [](T ele) { return ele * ele * ele; };
-      auto scale_func = [scale_data](T ele) { return ele * scale_data; };
-      auto inv_std_scale_func = [scale_data](T ele) {
-        return std::sqrt(1 / ele) * scale_data;
-      };
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
       // dy_dx
-      auto dx_end = var_map.unaryExpr(inv_std_scale_func)
+      auto dx_end = var_map.unaryExpr(inv_std_func)
+                        .cwiseProduct(scale_map)
                         .replicate(1, right)
                         .cwiseProduct(d_y_map);
       // dy_dmean_dx
       auto dx_mean = (T(-1.0) / right) *
-                     var_map.unaryExpr(inv_std_scale_func)
+                     var_map.unaryExpr(inv_std_func)
+                         .cwiseProduct(scale_map)
                          .replicate(1, right)
                          .cwiseProduct(d_y_map)
                          .rowwise()

@@ -274,11 +274,11 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
       auto dvar_end = var_map.unaryExpr(inv_std_func)
                           .unaryExpr(triple_product_func)
                           .cwiseProduct(dvar_end_part)
+                          .cwiseProduct(scale_map)
                           .replicate(1, right);
-      auto dx_var = (T(-1.0) / right) *
-                    (x_map - mean_map.replicate(1, right))
-                        .cwiseProduct(dvar_end)
-                        .unaryExpr(scale_func);
+      auto dx_var = (T(-1.0) / right) *
+                    (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
       d_x_map = dx_end + dx_mean + dx_var;
     }
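The d_x path keeps its three-term split (dx_end + dx_mean + dx_var), now pulling the scale in through scale_map instead of a captured scalar. As a sanity reference only, here is the textbook layer-norm input gradient written with the same three-term structure in NumPy; this is the standard derivation under this kernel's [left, right] view and [left, 1] scale, not a line-by-line transcription of the kernel (some of whose middle lines are collapsed above):

    import numpy as np

    def layer_norm_grad_x_2d(x2d, d_y, scale_col, mean, var):
        # x2d, d_y: [left, right]; scale_col, mean, var: [left, 1]; var already includes epsilon.
        left, right = x2d.shape
        inv_std = 1.0 / np.sqrt(var)
        dx_end = scale_col * inv_std * d_y                                   # direct dy/dx term
        dx_mean = -dx_end.sum(axis=1, keepdims=True) / right                 # contribution through the mean
        dx_var = -(x2d - mean) * inv_std**2 * (
            dx_end * (x2d - mean)).sum(axis=1, keepdims=True) / right        # contribution through the variance
        return dx_end + dx_mean + dx_var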
python/paddle/v2/fluid/tests/test_layer_norm_op.py

@@ -39,8 +39,9 @@ def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
     x.shape = [N, D]
     mean = np.mean(x, axis=1)
     var = np.var(x, axis=1) + epsilon
-    output = scale * np.divide((x - mean.reshape([N, 1])),
-                               (np.sqrt(var)).reshape([N, 1])) + beta
+    output = scale.reshape([1, D]) * np.divide(
+        (x - mean.reshape([N, 1])),
+        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
     output.shape = old_shape
     x.shape = old_shape
     return output, mean, var
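A small, hedged illustration of the broadcasting change above: previously scale and beta had shape [1] and broadcast as scalars over the flattened [N, D] input; after this change they carry one value per normalized element (shape [D], matching scale_shape = [D] in the test setup below) and are reshaped to [1, D] so they broadcast across the N rows. The values here are made up for demonstration:

    import numpy as np

    N, D = 3, 4
    x2d = np.random.rand(N, D).astype(np.float32)
    old_scale = np.array([0.5], dtype=np.float32)              # old test: scale_shape = [1]
    new_scale = np.linspace(0.5, 2.0, D, dtype=np.float32)     # new test: scale_shape = [D]

    print((old_scale * x2d).shape)                   # (3, 4): one value scales everything
    print((new_scale.reshape([1, D]) * x2d).shape)   # (3, 4): per-column (per-element) scaling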
@@ -55,8 +56,10 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     mean.shape = [N, 1]
     var.shape = [N, 1]
-    d_scale = np.sum(grad_y).reshape([1, ])
-    d_bias = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y).reshape([1, ])
+    d_scale = np.sum(grad_y, axis=1).reshape([1, D])
+    d_bias = scale.reshape([1, D]) * np.sum(
+        ((x - mean) * np.sqrt(1 / var)) * grad_y, axis=1).reshape([1, D])
     dx_end = np.sqrt(1.0 / var) * grad_y
@@ -69,7 +72,7 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     d_std = np.sum(-1.0 / var * (x - mean) * grad_y, axis=1).reshape([N, 1]) * (
         1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
-    grad_x = scale * (dx_end + d_mean + d_std)
+    grad_x = scale.reshape([1, D]) * (dx_end + d_mean + d_std)
     grad_y.shape = x_shape
     x.shape = x_shape
@@ -146,7 +149,8 @@ class TestLayerNormdOp(OpTest):
         # attr
         epsilon = 0.00001
         x_shape = shape
-        scale_shape = [1]
+        D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+        scale_shape = [D]
         np.random.random(123)
         x_val = np.random.random_sample(x_shape).astype(np.float32)
         scale_val = np.random.random_sample(scale_shape).astype(np.float32)