Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
5092f529
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5092f529
编写于
2月 03, 2018
作者:
C
chengduoZH
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Separate GPU and CPU implementation
上级
e0333735
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
202 addition
and
24 deletion
+202
-24
paddle/operators/layer_norm_op.cc
paddle/operators/layer_norm_op.cc
+185
-1
paddle/operators/layer_norm_op.h
paddle/operators/layer_norm_op.h
+13
-16
python/paddle/v2/fluid/tests/test_layer_norm_op.py
python/paddle/v2/fluid/tests/test_layer_norm_op.py
+4
-7
未找到文件。
paddle/operators/layer_norm_op.cc
浏览文件 @
5092f529
...
...
@@ -21,6 +21,13 @@ using Tensor = framework::Tensor;
using
LoDTensor
=
framework
::
LoDTensor
;
using
DataLayout
=
framework
::
DataLayout
;
// Mutable Eigen view over an existing buffer of T, interpreted as a
// dynamically sized, row-major matrix. The view does not own the memory.
template <typename T>
using EigenMatrixMapRowMajor = Eigen::Map<
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;

// Read-only counterpart of EigenMatrixMapRowMajor: the mapped matrix type is
// const-qualified, so the underlying buffer cannot be written through it.
template <typename T>
using ConstEigenMatrixMapRowMajor = Eigen::Map<
    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
class
LayerNormOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -101,7 +108,6 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment
(
R"DOC(
Layer Normalization.
Layer Norm has been implemented as discussed in the paper:
https://arxiv.org/abs/1607.06450
...
...
...
@@ -109,6 +115,75 @@ https://arxiv.org/abs/1607.06450
}
};
// CPU specialization of the layer-norm forward kernel.
//
// Input "X" is viewed as a row-major [left, right] matrix, where the split is
// produced by framework::flatten_to_2d(x_dims, begin_norm_axis). Every row is
// normalized independently:
//
//   Mean[i]     = mean_j(X[i, j])
//   Variance[i] = mean_j((X[i, j] - Mean[i])^2) + epsilon
//   Y[i, j]     = (X[i, j] - Mean[i]) * sqrt(1 / Variance[i])
//                 [* Scale[j]] [+ Bias[j]]
//
// "Scale" and "Bias" are optional inputs; each is applied only when present.
// Note that the stored "Variance" output already includes epsilon; the CPU
// grad kernel in this file computes sqrt(1 / Variance) without adding it
// again.
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");

    auto *output = ctx.Output<Tensor>("Y");
    auto *mean = ctx.Output<Tensor>("Mean");
    auto *var = ctx.Output<Tensor>("Variance");
    output->mutable_data<T>(ctx.GetPlace());
    mean->mutable_data<T>(ctx.GetPlace());
    var->mutable_data<T>(ctx.GetPlace());

    // Collapse X to 2-D: rows are normalized independently over `right`
    // elements each.
    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
    const int left = static_cast<int>(matrix_dim[0]);
    const int right = static_cast<int>(matrix_dim[1]);

    auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
    auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
    auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
    auto output_map =
        EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);

    auto square = [](T ele) { return ele * ele; };
    auto add_epsilon = [epsilon](T ele) { return ele + epsilon; };
    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };

    mean_map = input_map.rowwise().mean();

    // Lazy expression (X - Mean); it is only evaluated when assigned below,
    // so it must be built after mean_map has been filled in.
    const auto centered = input_map - mean_map.replicate(1, right);

    var_map =
        centered.unaryExpr(square).rowwise().mean().unaryExpr(add_epsilon);

    // Materialize 1/std once per row. Leaving this as a lazy expression would
    // redo the division and sqrt for every element of the row when it is
    // replicated across `right` columns.
    const auto inv_std = var_map.unaryExpr(inv_std_func).eval();

    // TODO(zcd): Consider whether it is appropriate for `output_map` and
    // `input_map` to point to the same memory.
    const auto normalized =
        centered.cwiseProduct(inv_std.replicate(1, right));

    if (scale && bias) {
      auto scale_map =
          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
      auto bias_map =
          ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
      output_map = normalized.cwiseProduct(scale_map.replicate(left, 1)) +
                   bias_map.replicate(left, 1);
    } else if (scale) {
      auto scale_map =
          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
      output_map = normalized.cwiseProduct(scale_map.replicate(left, 1));
    } else if (bias) {
      auto bias_map =
          ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
      output_map = normalized + bias_map.replicate(left, 1);
    } else {
      output_map = normalized;
    }
  }
};
class
LayerNormGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -161,6 +236,115 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
}
};
// CPU specialization of the layer-norm backward kernel.
//
// "X" and "Y@GRAD" are viewed as row-major [left, right] matrices (split by
// framework::flatten_to_2d at `begin_norm_axis`); "Mean" and "Variance" are
// viewed as [left, 1] columns. The inverse standard deviation is taken as
// sqrt(1 / Variance) with no epsilon term added here.
//
// Each of the outputs "X@GRAD", "Scale@GRAD" and "Bias@GRAD" is computed only
// when the corresponding output tensor is present:
//   Bias@GRAD[j]  = sum_i dY[i, j]
//   Scale@GRAD[j] = sum_i (X[i, j] - Mean[i]) * inv_std[i] * dY[i, j]
//   X@GRAD        = dx_end + dx_mean + dx_var   (see the d_x section below)
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
    const auto *mean = ctx.Input<Tensor>("Mean");
    const auto *var = ctx.Input<Tensor>("Variance");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

    const auto &x_dims = x->dims();
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
    const int left = static_cast<int>(matrix_dim[0]);
    const int right = static_cast<int>(matrix_dim[1]);

    // init output
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);

    // Shared by the d_scale and d_x paths: 1/std per row, evaluated once, and
    // the lazy (X - Mean) expression.
    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
    const auto inv_std_map = var_map.unaryExpr(inv_std_func).eval();
    const auto centered = x_map - mean_map.replicate(1, right);

    if (d_bias) {
      d_bias->mutable_data<T>(ctx.GetPlace());
      auto d_bias_map =
          EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
      d_bias_map = d_y_map.colwise().sum();
    }

    if (d_scale) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      auto d_scale_map =
          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
      // There are two equations for computing d_scale: one uses "Y" and the
      // other does not. This is the one that does not use "Y".
      d_scale_map = centered.cwiseProduct(inv_std_map.replicate(1, right))
                        .cwiseProduct(d_y_map)
                        .colwise()
                        .sum();
    }

    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
      auto triple_product_func = [](T ele) { return ele * ele * ele; };

      // TODO(zcd): this code can be refined
      //
      // Branch on `scale` (the tensor that is actually dereferenced) rather
      // than on `d_scale`: the forward kernel multiplies by Scale whenever it
      // is present, so X's gradient depends on Scale regardless of whether
      // Scale's own gradient was requested.
      if (scale) {
        auto scale_map =
            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
        // dy_dx
        auto dx_end = inv_std_map.replicate(1, right)
                          .cwiseProduct(d_y_map)
                          .cwiseProduct(scale_map.replicate(left, 1));
        // dy_dmean_dx
        auto dx_mean = (T(-1.0) / right) *
                       dx_end.rowwise().sum().replicate(1, right);
        // dy_var_dx
        auto dvar_end_part =
            centered.cwiseProduct(scale_map.replicate(left, 1))
                .cwiseProduct(d_y_map)
                .rowwise()
                .sum();
        auto dvar_end = inv_std_map.unaryExpr(triple_product_func)
                            .cwiseProduct(dvar_end_part)
                            .replicate(1, right);
        auto dx_var = (T(-1.0) / right) * centered.cwiseProduct(dvar_end);

        d_x_map = dx_end + dx_mean + dx_var;
      } else {
        // dy_dx
        auto dx_end = inv_std_map.replicate(1, right).cwiseProduct(d_y_map);
        // dy_dmean_dx
        auto dx_mean = (T(-1.0) / right) *
                       dx_end.rowwise().sum().replicate(1, right);
        // dy_var_dx
        auto dvar_end_part =
            centered.cwiseProduct(d_y_map).rowwise().sum();
        auto dvar_end = inv_std_map.unaryExpr(triple_product_func)
                            .cwiseProduct(dvar_end_part)
                            .replicate(1, right);
        auto dx_var = (T(-1.0) / right) * centered.cwiseProduct(dvar_end);

        d_x_map = dx_end + dx_mean + dx_var;
      }
    }
  }
};
}
// namespace operators
}
// namespace paddle
...
...
paddle/operators/layer_norm_op.h
浏览文件 @
5092f529
...
...
@@ -78,7 +78,7 @@ class LayerNormKernel : public framework::OpKernel<T> {
auto
*
var
=
ctx
.
Output
<
Tensor
>
(
"Variance"
);
const
auto
begin_norm_axis
=
ctx
.
Attr
<
int
>
(
"begin_norm_axis"
);
const
auto
&
x_dims
=
x
.
dims
();
const
auto
x_dims
=
x
.
dims
();
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
...
...
@@ -87,11 +87,12 @@ class LayerNormKernel : public framework::OpKernel<T> {
auto
matrix_dim
=
framework
::
flatten_to_2d
(
x_dims
,
begin_norm_axis
);
int
left
=
static_cast
<
int
>
(
matrix_dim
[
0
]);
int
right
=
static_cast
<
int
>
(
matrix_dim
[
1
]);
framework
::
DDim
matrix_shape
({
left
,
right
});
x
.
Resize
(
matrix_shape
);
y
->
Resize
(
matrix_shape
);
Tensor
out
;
out
.
ShareDataWith
(
*
y
);
out
.
Resize
(
matrix_shape
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
math
::
RowwiseMean
<
DeviceContext
,
T
>
row_mean
;
...
...
@@ -101,30 +102,24 @@ class LayerNormKernel : public framework::OpKernel<T> {
// functor-> get variance
ElementwiseComputeEx
<
SubAndSquareFunctor
<
T
>
,
DeviceContext
,
T
>
(
ctx
,
&
x
,
mean
,
/*axis*/
0
,
SubAndSquareFunctor
<
T
>
(),
y
);
row_mean
(
dev_ctx
,
*
y
,
var
);
ctx
,
&
x
,
mean
,
/*axis*/
0
,
SubAndSquareFunctor
<
T
>
(),
&
out
);
row_mean
(
dev_ctx
,
out
,
var
);
// functor-> get norm_out
ElementwiseComputeEx
<
SubFunctor
<
T
>
,
DeviceContext
,
T
>
(
ctx
,
&
x
,
mean
,
/*axis*/
0
,
SubFunctor
<
T
>
(),
y
);
ctx
,
&
x
,
mean
,
/*axis*/
0
,
SubFunctor
<
T
>
(),
&
out
);
ElementwiseComputeEx
<
DivAndSqrtFunctor
<
T
>
,
DeviceContext
,
T
>
(
ctx
,
y
,
var
,
/*axis*/
0
,
DivAndSqrtFunctor
<
T
>
(
static_cast
<
T
>
(
epsilon
))
,
y
);
ctx
,
&
out
,
var
,
/*axis*/
0
,
DivAndSqrtFunctor
<
T
>
(
static_cast
<
T
>
(
epsilon
)),
&
out
);
framework
::
DDim
scale_shape
({
right
});
if
(
scale
)
{
Tensor
scale_matrix
=
*
scale
;
scale_matrix
.
Resize
(
scale_shape
);
ElementwiseComputeEx
<
MulFunctor
<
T
>
,
DeviceContext
,
T
>
(
ctx
,
y
,
&
scale_matrix
,
/*axis*/
1
,
MulFunctor
<
T
>
(),
y
);
ctx
,
&
out
,
scale
,
/*axis*/
1
,
MulFunctor
<
T
>
(),
&
out
);
}
if
(
bias
)
{
Tensor
bias_matrix
=
*
bias
;
bias_matrix
.
Resize
(
scale_shape
);
ElementwiseComputeEx
<
AddFunctor
<
T
>
,
DeviceContext
,
T
>
(
ctx
,
y
,
&
bias_matrix
,
/*axis*/
1
,
AddFunctor
<
T
>
(),
y
);
ctx
,
&
out
,
bias
,
/*axis*/
1
,
AddFunctor
<
T
>
(),
&
out
);
}
y
->
Resize
(
x_dims
);
}
};
...
...
@@ -184,6 +179,7 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
if
(
d_x
)
{
framework
::
DDim
vec_shape
({
left
});
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
dx_dim
=
d_x
->
dims
();
Tensor
temp_vec
;
temp_vec
.
mutable_data
<
T
>
(
vec_shape
,
ctx
.
GetPlace
());
...
...
@@ -227,6 +223,7 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
ElementwiseComputeEx
<
DivAndSqrtFunctor
<
T
>
,
DeviceContext
,
T
>
(
ctx
,
d_x
,
&
var
,
/*axis*/
0
,
DivAndSqrtFunctor
<
T
>
(
static_cast
<
T
>
(
epsilon
)),
d_x
);
d_x
->
Resize
(
dx_dim
);
}
}
};
...
...
python/paddle/v2/fluid/tests/test_layer_norm_op.py
浏览文件 @
5092f529
...
...
@@ -62,9 +62,9 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
grad_x
=
dx_end
+
d_mean
+
d_std
grad_y
.
shape
=
x_shape
x
.
shape
=
x_shape
grad_x
.
shape
,
x
.
shape
,
grad_y
.
shape
=
x_shape
,
x_shape
,
x_shape
scale
.
shape
=
scale_shape
var
.
shape
,
mean
.
shape
=
[
N
,
],
[
N
,
]
return
grad_x
,
d_scale
,
d_bias
...
...
@@ -112,10 +112,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
class
TestLayerNormdOp
(
OpTest
):
def
__assert_close
(
self
,
tensor
,
np_array
,
msg
,
atol
=
1e-4
):
self
.
assertTrue
(
np
.
allclose
(
np
.
array
(
tensor
).
reshape
(
np_array
.
shape
),
np_array
,
atol
=
atol
),
msg
)
self
.
assertTrue
(
np
.
allclose
(
np
.
array
(
tensor
),
np_array
,
atol
=
atol
),
msg
)
def
__assert_grad_close
(
self
,
tensor
,
...
...
@@ -123,7 +120,7 @@ class TestLayerNormdOp(OpTest):
name
,
place
,
max_relative_error
=
0.02
):
a
=
np
.
array
(
tensor
)
.
reshape
(
np_array
.
shape
)
a
=
np
.
array
(
tensor
)
b
=
np_array
abs_a
=
np
.
abs
(
a
)
abs_a
[
abs_a
<
1e-5
]
=
1
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录