Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
263e0197
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
263e0197
编写于
1月 30, 2018
作者:
C
chengduoZH
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
follow comments
上级
09570b48
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
133 addition
and
64 deletion
+133
-64
paddle/operators/layer_norm_op.cc
paddle/operators/layer_norm_op.cc
+133
-64
未找到文件。
paddle/operators/layer_norm_op.cc
浏览文件 @
263e0197
...
...
@@ -33,29 +33,35 @@ class LayerNormOp : public framework::OperatorWithKernel {
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Scale"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Bias"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Y"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Y"
),
"Output(Y) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Mean"
),
"Output(Mean) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"Variance"
),
"Output(Variance) of LayerNormOp should not be null."
);
auto
x_dim
=
ctx
->
GetInputDim
(
"X"
);
auto
begin_norm_axis
=
ctx
->
Attrs
().
Get
<
int
>
(
"begin_norm_axis"
);
PADDLE_ENFORCE_LT
(
begin_norm_axis
,
x_dim
.
size
(),
"'begin_norm_axis' must be less than the rank of X"
);
"'begin_norm_axis' must be less than the rank of X
.
"
);
auto
matrix_dim
=
framework
::
flatten_to_2d
(
x_dim
,
begin_norm_axis
);
int
left
=
static_cast
<
int
>
(
matrix_dim
[
0
]);
int
right
=
static_cast
<
int
>
(
matrix_dim
[
1
]);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
)[
0
],
right
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
)[
0
],
right
);
if
(
ctx
->
HasInput
(
"Scale"
))
{
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Scale"
)[
0
],
right
);
}
if
(
ctx
->
HasInput
(
"Bias"
))
{
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
).
size
(),
1UL
);
PADDLE_ENFORCE_EQ
(
ctx
->
GetInputDim
(
"Bias"
)[
0
],
right
);
}
ctx
->
SetOutputDim
(
"Y"
,
ctx
->
GetInputDim
(
"X"
));
ctx
->
SetOutputDim
(
"Mean"
,
{
left
});
ctx
->
SetOutputDim
(
"Variance"
,
{
left
});
ctx
->
ShareLoD
(
"X"
,
"Y"
);
}
};
...
...
@@ -64,18 +70,26 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LayerNormOpMaker
(
OpProto
*
proto
,
OpAttrChecker
*
op_checker
)
:
OpProtoAndCheckerMaker
(
proto
,
op_checker
)
{
AddInput
(
"X"
,
"
The input tensor
"
);
AddInput
(
"X"
,
"
(LoDTensor) The input tensor.
"
);
AddInput
(
"Scale"
,
"Scale is a 1-dimensional tensor of size H "
"that is applied to the output"
);
"(Tensor, optional) Scale is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output."
)
.
AsDispensable
();
AddInput
(
"Bias"
,
"Bias is a 1-dimensional tensor of size H "
"that is applied to the output"
);
AddOutput
(
"Y"
,
"result after normalization"
);
AddOutput
(
"Mean"
,
"Mean of the current mini batch."
);
AddOutput
(
"Variance"
,
"Variance of the current mini batch."
);
AddAttr
<
float
>
(
"epsilon"
,
""
)
"(Tensor, optional) Bias is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output."
)
.
AsDispensable
();
AddOutput
(
"Y"
,
"(LoDTensor) Result after normalization."
);
AddOutput
(
"Mean"
,
"(Tensor) Mean of the current mini batch."
)
.
AsIntermediate
();
AddOutput
(
"Variance"
,
"(Tensor) Variance of the current mini batch."
)
.
AsIntermediate
();
AddAttr
<
float
>
(
"epsilon"
,
"(float, default 1e-5) Constant for "
"numerical stability"
)
.
SetDefault
(
1e-5
)
.
AddCustomChecker
([](
const
float
&
epsilon
)
{
PADDLE_ENFORCE
(
epsilon
>=
0.0
f
&&
epsilon
<=
0.001
f
,
...
...
@@ -83,7 +97,9 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr
<
int
>
(
"begin_norm_axis"
,
"(int default:1), the "
"axis of `begin_norm_axis ... Rank(X) - 1` will be normalized"
)
"axis of `begin_norm_axis ... Rank(X) - 1` will be "
"normalized. `begin_norm_axis` splits the tensor(`X`) to a "
"matrix [N,H]."
)
.
SetDefault
(
1
)
.
AddCustomChecker
([](
const
int
&
begin_norm_axis
)
{
PADDLE_ENFORCE_GT
(
begin_norm_axis
,
0
,
...
...
@@ -124,8 +140,7 @@ class LayerNormKernel<platform::CPUDeviceContext, T>
int
right
=
static_cast
<
int
>
(
matrix_dim
[
1
]);
auto
input_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
x
->
data
<
T
>
(),
left
,
right
);
auto
scale_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
scale
->
data
<
T
>
(),
1
,
right
);
auto
bias_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
bias
->
data
<
T
>
(),
1
,
right
);
auto
mean_map
=
EigenMatrixMapRowMajor
<
T
>
(
mean
->
data
<
T
>
(),
left
,
1
);
auto
var_map
=
EigenMatrixMapRowMajor
<
T
>
(
var
->
data
<
T
>
(),
left
,
1
);
auto
output_map
=
EigenMatrixMapRowMajor
<
T
>
(
output
->
data
<
T
>
(),
left
,
right
);
...
...
@@ -141,14 +156,32 @@ class LayerNormKernel<platform::CPUDeviceContext, T>
.
unaryExpr
(
add_epslion
);
auto
inv_std_func
=
[](
T
ele
)
{
return
std
::
sqrt
(
1
/
ele
);
};
// TODO(zcd): Some thinking about output_map, is it appropriate that
// `output_map` and `input_map` point to the same memory.
auto
inv_std
=
var_map
.
unaryExpr
(
inv_std_func
);
output_map
=
(
input_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
inv_std
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
+
bias_map
.
replicate
(
left
,
1
);
if
(
scale
&&
bias
)
{
auto
scale_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
scale
->
data
<
T
>
(),
1
,
right
);
auto
bias_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
bias
->
data
<
T
>
(),
1
,
right
);
output_map
=
(
input_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
inv_std
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
+
bias_map
.
replicate
(
left
,
1
);
}
else
if
(
scale
)
{
auto
scale_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
scale
->
data
<
T
>
(),
1
,
right
);
output_map
=
(
input_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
inv_std
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
));
}
else
if
(
bias
)
{
auto
bias_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
bias
->
data
<
T
>
(),
1
,
right
);
output_map
=
(
input_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
inv_std
.
replicate
(
1
,
right
))
+
bias_map
.
replicate
(
left
,
1
);
}
else
{
output_map
=
(
input_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
inv_std
.
replicate
(
1
,
right
));
}
}
};
...
...
@@ -158,11 +191,16 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
// check input
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
));
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Scale"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Mean"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Variance"
),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Y"
)),
""
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"X"
),
"Input(X) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Scale"
),
"Input(Scale) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Mean"
),
"Input(Mean) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Variance"
),
"Input(Variance) of LayerNormOp should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Y"
)),
"Input(Y@GRAD) of LayerNormOp should not be null."
);
// check output
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)))
{
...
...
@@ -222,7 +260,6 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
auto
*
d_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Scale"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
scale_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
scale
->
data
<
T
>
(),
1
,
right
);
auto
x_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
x
->
data
<
T
>
(),
left
,
right
);
auto
d_y_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
d_y
->
data
<
T
>
(),
left
,
right
);
auto
mean_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
mean
->
data
<
T
>
(),
left
,
1
);
...
...
@@ -254,35 +291,67 @@ class LayerNormGradKernel<platform::CPUDeviceContext, T>
auto
d_x_map
=
EigenMatrixMapRowMajor
<
T
>
(
d_x
->
data
<
T
>
(),
left
,
right
);
auto
triple_product_func
=
[](
T
ele
)
{
return
ele
*
ele
*
ele
;
};
auto
inv_std_func
=
[](
T
ele
)
{
return
std
::
sqrt
(
1
/
ele
);
};
// dy_dx
auto
dx_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
d_y_map
)
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
));
// dy_dmean_dx
auto
dx_mean
=
(
T
(
-
1.0
)
/
right
)
*
var_map
.
unaryExpr
(
inv_std_func
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
d_y_map
)
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
.
rowwise
()
.
sum
()
.
replicate
(
1
,
right
);
// dy_var_dx
auto
dvar_end_part
=
(
x_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
.
cwiseProduct
(
d_y_map
)
.
rowwise
()
.
sum
();
auto
dvar_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
unaryExpr
(
triple_product_func
)
.
cwiseProduct
(
dvar_end_part
)
.
replicate
(
1
,
right
);
auto
dx_var
=
(
T
(
-
1.0
)
/
right
)
*
(
x_map
-
mean_map
.
replicate
(
1
,
right
)).
cwiseProduct
(
dvar_end
);
d_x_map
=
dx_end
+
dx_mean
+
dx_var
;
// TODO(zcd): these code can be refined
if
(
d_scale
)
{
auto
scale_map
=
ConstEigenMatrixMapRowMajor
<
T
>
(
scale
->
data
<
T
>
(),
1
,
right
);
// dy_dx
auto
dx_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
d_y_map
)
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
));
// dy_dmean_dx
auto
dx_mean
=
(
T
(
-
1.0
)
/
right
)
*
var_map
.
unaryExpr
(
inv_std_func
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
d_y_map
)
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
.
rowwise
()
.
sum
()
.
replicate
(
1
,
right
);
// dy_var_dx
auto
dvar_end_part
=
(
x_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
scale_map
.
replicate
(
left
,
1
))
.
cwiseProduct
(
d_y_map
)
.
rowwise
()
.
sum
();
auto
dvar_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
unaryExpr
(
triple_product_func
)
.
cwiseProduct
(
dvar_end_part
)
.
replicate
(
1
,
right
);
auto
dx_var
=
(
T
(
-
1.0
)
/
right
)
*
(
x_map
-
mean_map
.
replicate
(
1
,
right
)).
cwiseProduct
(
dvar_end
);
d_x_map
=
dx_end
+
dx_mean
+
dx_var
;
}
else
{
// dy_dx
auto
dx_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
d_y_map
);
// dy_dmean_dx
auto
dx_mean
=
(
T
(
-
1.0
)
/
right
)
*
var_map
.
unaryExpr
(
inv_std_func
)
.
replicate
(
1
,
right
)
.
cwiseProduct
(
d_y_map
)
.
rowwise
()
.
sum
()
.
replicate
(
1
,
right
);
// dy_var_dx
auto
dvar_end_part
=
(
x_map
-
mean_map
.
replicate
(
1
,
right
))
.
cwiseProduct
(
d_y_map
)
.
rowwise
()
.
sum
();
auto
dvar_end
=
var_map
.
unaryExpr
(
inv_std_func
)
.
unaryExpr
(
triple_product_func
)
.
cwiseProduct
(
dvar_end_part
)
.
replicate
(
1
,
right
);
auto
dx_var
=
(
T
(
-
1.0
)
/
right
)
*
(
x_map
-
mean_map
.
replicate
(
1
,
right
)).
cwiseProduct
(
dvar_end
);
d_x_map
=
dx_end
+
dx_mean
+
dx_var
;
}
}
}
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录