Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
df0e74db
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
df0e74db
编写于
2月 05, 2018
作者:
C
chengduoZH
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Unify GPU and CPU implementation
上级
5092f529
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
4 additions
and
187 deletions
+4
-187
paddle/operators/layer_norm_op.cc
paddle/operators/layer_norm_op.cc
+0
-185
paddle/operators/layer_norm_op.h
paddle/operators/layer_norm_op.h
+1
-1
python/paddle/v2/fluid/tests/test_layer_norm_op.py
python/paddle/v2/fluid/tests/test_layer_norm_op.py
+3
-1
未找到文件。
paddle/operators/layer_norm_op.cc
浏览文件 @
df0e74db
...
...
@@ -21,13 +21,6 @@ using Tensor = framework::Tensor;
using
LoDTensor
=
framework
::
LoDTensor
;
using
DataLayout
=
framework
::
DataLayout
;
// Mutable view: wraps a raw T buffer as a dynamically-sized, row-major
// Eigen matrix (an Eigen::Map over Eigen::Matrix<T, Dynamic, Dynamic, RowMajor>).
template
<
typename
T
>
using
EigenMatrixMapRowMajor
=
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
,
Eigen
::
RowMajor
>>
;
// Read-only counterpart of the alias above: the mapped matrix type is const,
// so the view cannot be assigned through.
template
<
typename
T
>
using
ConstEigenMatrixMapRowMajor
=
Eigen
::
Map
<
const
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
,
Eigen
::
RowMajor
>>
;
class
LayerNormOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -115,75 +108,6 @@ https://arxiv.org/abs/1607.06450
}
};
// CPU specialization of the layer-norm forward kernel.
//
// "X" is viewed as a row-major [left, right] matrix obtained by flattening its
// dims at `begin_norm_axis`; every row is normalized independently:
//   Mean[i]     = mean_j X[i, j]
//   Variance[i] = mean_j (X[i, j] - Mean[i])^2 + epsilon
//   Y[i, j]     = (X[i, j] - Mean[i]) * sqrt(1 / Variance[i])
//                 [* Scale[j]] [+ Bias[j]]
// Note that the stored "Variance" output already has `epsilon` folded in.
// "Scale" and "Bias" are optional; each is applied only when its input tensor
// is non-null.
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");

    auto *y = ctx.Output<Tensor>("Y");
    auto *mean = ctx.Output<Tensor>("Mean");
    auto *variance = ctx.Output<Tensor>("Variance");
    y->mutable_data<T>(ctx.GetPlace());
    mean->mutable_data<T>(ctx.GetPlace());
    variance->mutable_data<T>(ctx.GetPlace());

    // Collapse X to 2-D: rows are independent samples, columns are the
    // elements normalized together.
    auto flat_dims = framework::flatten_to_2d(x_dims, begin_norm_axis);
    int left = static_cast<int>(flat_dims[0]);
    int right = static_cast<int>(flat_dims[1]);

    auto x_mat = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
    auto mean_col = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
    auto var_col = EigenMatrixMapRowMajor<T>(variance->data<T>(), left, 1);
    auto y_mat = EigenMatrixMapRowMajor<T>(y->data<T>(), left, right);

    auto square = [](T v) { return v * v; };
    auto plus_epsilon = [epsilon](T v) { return v + epsilon; };
    auto inv_sqrt = [](T v) { return std::sqrt(1 / v); };

    mean_col = x_mat.rowwise().mean();

    // Lazy expression; it reads mean_col, which is already filled in above.
    auto centered = x_mat - mean_col.replicate(1, right);

    var_col =
        centered.unaryExpr(square).rowwise().mean().unaryExpr(plus_epsilon);

    // TODO(zcd): Some thinking about y_mat, is it appropriate that
    // `y_mat` and `x_mat` point to the same memory.
    auto inv_std = var_col.unaryExpr(inv_sqrt);

    // (X - Mean) / std, still unevaluated; each branch below extends it and
    // assigns the result into Y in a single pass.
    auto normalized = centered.cwiseProduct(inv_std.replicate(1, right));

    if (scale && bias) {
      auto scale_row =
          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
      auto bias_row = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
      y_mat = normalized.cwiseProduct(scale_row.replicate(left, 1)) +
              bias_row.replicate(left, 1);
    } else if (scale) {
      auto scale_row =
          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
      y_mat = normalized.cwiseProduct(scale_row.replicate(left, 1));
    } else if (bias) {
      auto bias_row = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
      y_mat = normalized + bias_row.replicate(left, 1);
    } else {
      y_mat = normalized;
    }
  }
};
class
LayerNormGradOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
...
...
@@ -236,115 +160,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
}
};
// CPU specialization of the layer-norm backward kernel.
//
// "X" and dY are viewed as row-major [left, right] matrices (dims flattened at
// `begin_norm_axis`); "Mean" and "Variance" are [left, 1] columns. The
// "Variance" input is used as-is via sqrt(1 / Variance) -- no epsilon is added
// here (the CPU forward kernel in this file stores variance + epsilon).
//
// Each gradient output is optional and is computed only when its tensor is
// non-null:
//   dBias[j]  = sum_i dY[i, j]
//   dScale[j] = sum_i (X[i, j] - Mean[i]) * inv_std[i] * dY[i, j]
//   dX        = dx_end + dx_mean + dx_var   (see the d_x section below)
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
    const auto *mean = ctx.Input<Tensor>("Mean");
    const auto *var = ctx.Input<Tensor>("Variance");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

    const auto &x_dims = x->dims();

    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
    int left = static_cast<int>(matrix_dim[0]);
    int right = static_cast<int>(matrix_dim[1]);

    // init output
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);

    // 1 / sqrt(variance); shared by the d_scale and d_x computations.
    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };

    if (d_bias) {
      d_bias->mutable_data<T>(ctx.GetPlace());
      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
      d_bias_map = d_y_map.colwise().sum();
    }
    if (d_scale) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      auto d_scale_map =
          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
      // There are two equations to compute d_scale. One uses "Y" and the
      // other does not use "Y"; this is the one that does not.
      d_scale_map =
          ((x_map - mean_map.replicate(1, right))
               .cwiseProduct(
                   var_map.unaryExpr(inv_std_func).replicate(1, right))
               .cwiseProduct(d_y_map))
              .colwise()
              .sum();
    }

    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
      auto triple_product_func = [](T ele) { return ele * ele * ele; };
      // Materialized once because it is reused by several expressions below.
      auto inv_std_map = var_map.unaryExpr(inv_std_func).eval();
      // TODO(zcd): these code can be refined
      // The scaled path reads `scale`, so it must be guarded by `scale`
      // itself rather than by `d_scale`: Scale may be absent, or its gradient
      // may not have been requested while Scale still took part in forward.
      if (scale) {
        auto scale_map =
            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
        // dy_dx
        auto dx_end = inv_std_map.replicate(1, right)
                          .cwiseProduct(d_y_map)
                          .cwiseProduct(scale_map.replicate(left, 1));
        // dy_dmean_dx
        auto dx_mean = (T(-1.0) / right) *
                       dx_end.rowwise().sum().replicate(1, right);
        // dy_var_dx
        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
                                 .cwiseProduct(scale_map.replicate(left, 1))
                                 .cwiseProduct(d_y_map)
                                 .rowwise()
                                 .sum();
        auto dvar_end = inv_std_map.unaryExpr(triple_product_func)
                            .cwiseProduct(dvar_end_part)
                            .replicate(1, right);
        auto dx_var =
            (T(-1.0) / right) *
            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);

        d_x_map = dx_end + dx_mean + dx_var;
      } else {
        // dy_dx
        auto dx_end = inv_std_map.replicate(1, right).cwiseProduct(d_y_map);
        // dy_dmean_dx
        auto dx_mean = (T(-1.0) / right) *
                       dx_end.rowwise().sum().replicate(1, right);
        // dy_var_dx
        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
                                 .cwiseProduct(d_y_map)
                                 .rowwise()
                                 .sum();
        auto dvar_end = inv_std_map.unaryExpr(triple_product_func)
                            .cwiseProduct(dvar_end_part)
                            .replicate(1, right);
        auto dx_var =
            (T(-1.0) / right) *
            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);

        d_x_map = dx_end + dx_mean + dx_var;
      }
    }
  }
};
}
// namespace operators
}
// namespace paddle
...
...
paddle/operators/layer_norm_op.h
浏览文件 @
df0e74db
...
...
@@ -31,7 +31,7 @@ template <typename T>
struct
DivAndSqrtFunctor
{
explicit
DivAndSqrtFunctor
(
T
epsilon
)
{
epsilon_
=
epsilon
;
}
inline
HOSTDEVICE
T
operator
()(
T
a
,
T
b
)
const
{
return
a
/
(
sqrt
(
b
)
+
epsilon_
);
return
a
/
(
sqrt
(
b
+
epsilon_
)
);
}
private:
...
...
python/paddle/v2/fluid/tests/test_layer_norm_op.py
浏览文件 @
df0e74db
...
...
@@ -20,6 +20,8 @@ import paddle.v2.fluid.core as core
from
paddle.v2.fluid.op
import
Operator
from
paddle.v2.fluid.framework
import
grad_var_name
# Seed NumPy's global RNG so the randomly generated test inputs are
# reproducible. (np.random.random(123) would only draw and discard 123
# samples; it does not set the seed.)
np.random.seed(123)
def
_reference_layer_norm_naive
(
x
,
scale
,
beta
,
epsilon
,
begin_norm_axis
=
1
):
x_shape
=
x
.
shape
...
...
@@ -148,7 +150,7 @@ class TestLayerNormdOp(OpTest):
x_shape
=
shape
D
=
reduce
(
mul
,
x_shape
[
begin_norm_axis
:
len
(
x_shape
)],
1
)
scale_shape
=
[
D
]
np
.
random
.
random
(
123
)
x_val
=
np
.
random
.
random_sample
(
x_shape
).
astype
(
np
.
float32
)
scale_val
=
np
.
random
.
random_sample
(
scale_shape
).
astype
(
np
.
float32
)
bias_val
=
np
.
random
.
random_sample
(
scale_shape
).
astype
(
np
.
float32
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录