Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
e7a4cfc0
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e7a4cfc0
编写于
7月 11, 2018
作者:
G
guosheng
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
complete the hsigmoid_op
上级
d6953816
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
63 addition
and
54 deletion
+63
-54
paddle/fluid/operators/hierarchical_sigmoid_op.cc
paddle/fluid/operators/hierarchical_sigmoid_op.cc
+10
-10
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+10
-9
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+13
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+23
-22
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+7
-11
未找到文件。
paddle/fluid/operators/hierarchical_sigmoid_op.cc
浏览文件 @
e7a4cfc0
...
...
@@ -86,25 +86,25 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(Tensor, required) The input Tensor, which the shape is"
"[N, D], which N is the size of mini-batch,"
"D is the embded size"
);
"(Tensor, required) The input tensor with shape [N, D], "
"where N is the size of mini-batch, and D is the feature size."
);
AddInput
(
"W"
,
"(Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is
s
a 2-D tensor, the shape is"
"[num_classes - 1, D]"
);
"sigmoid operator, each of them is a 2-D tensor, the shape is"
"[num_classes - 1, D]
.
"
);
AddInput
(
"Label"
,
"(Tensor, required), The labels of training data. It's a"
"
1-D tensor, which the shape is [N, 1]
"
);
"
tensor with shape [N, 1].
"
);
AddInput
(
"Bias"
,
"(Tensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1]"
);
"[1, num_classes - 1]
.
"
);
AddOutput
(
"Out"
,
"(Tensor, required) The output of hierarchical sigmoid operator."
"
the shape is [N, 1]
"
);
"
The shape is [N, 1].
"
);
AddOutput
(
"PreOut"
,
"(Tensor, required) A intermedia 2-D Tensor, which the shape is "
"[batch_size, code_length]"
)
"(Tensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes."
)
.
AsIntermediate
();
AddAttr
<
AttrType
>
(
"num_classes"
,
"(int, required), The number of classes"
)
.
SetDefault
(
2
);
...
...
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
e7a4cfc0
...
...
@@ -44,9 +44,11 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
framework
::
Tensor
sum
;
math
::
SetConstant
<
DeviceContext
,
T
>
zero
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
pre_out_data
=
pre_out
->
mutable_data
<
T
>
(
auto
*
pre_out_data
=
pre_out
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
batch_size
,
code_length
}),
ctx
.
GetPlace
());
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
// Not all class(leaf) nodes' path lengths equal code_length, thus init as
// 0s can avoid out of path's loss.
zero
(
dev_ctx
,
pre_out
,
static_cast
<
T
>
(
0.0
));
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
math
::
RowwiseSum
<
DeviceContext
,
T
>
row_sum
;
...
...
@@ -61,16 +63,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
bit_code
.
Add
(
pre_out
,
*
bias
);
}
bit_code
.
Mul
(
pre_out
,
*
w
,
*
in
);
// clip t
he matrix with (-40, 40)
// clip t
o [-40, 40]
Transform
<
DeviceContext
>
trans
;
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
pre_out_data
,
pre_out_data
+
pre_out
->
numel
(),
pre_out_data
,
ClipFunctor
<
T
>
(
static_cast
<
T
>
(
-
40.0
),
static_cast
<
T
>
(
40.0
)));
bit_code
.
Sum
(
*
pre_out
,
out
,
static_cast
<
T
>
(
-
1
));
// softrelu with threshold is 40.0
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
pre_out_data
,
pre_out_data
+
pre_out
->
numel
(),
pre_out_data
,
ClipFunctor
<
T
>
(
static_cast
<
T
>
(
-
40.0
),
static_cast
<
T
>
(
40.0
)));
// use softrelu to calculate cross entropy
pre_out_mat
.
device
(
place
)
=
(
static_cast
<
T
>
(
1.0
)
+
pre_out_mat
.
exp
()).
log
();
row_sum
(
dev_ctx
,
*
pre_out
,
&
sum
);
out_mat
.
device
(
place
)
=
sum_mat
+
out_mat
;
...
...
@@ -102,14 +101,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
math
::
MatrixBitCodeFunctor
<
T
>
bit_code
(
num_classes
,
label
->
data
<
int64_t
>
());
// softrelu derivative
Eigen
::
array
<
int
,
2
>
bcast
({
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])});
Eigen
::
array
<
int
,
2
>
bcast
({{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])}});
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
*
out_grad
);
pre_out_grad_mat
=
out_grad_mat
.
broadcast
(
bcast
);
pre_out_grad_mat
.
device
(
place
)
=
pre_out_grad_mat
*
(
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
());
(
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
());
// softrelu derivative
bit_code
.
Sub
(
&
pre_out_grad
);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if
(
bias_grad
)
{
bias_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
bit_code
.
AddGrad
(
pre_out_grad
,
bias_grad
);
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
e7a4cfc0
...
...
@@ -65,12 +65,24 @@ inline constexpr size_t FindLastSet(size_t x) {
struct
SimpleCode
{
SimpleCode
(
size_t
code
,
size_t
num_classes
)
:
c_
(
code
+
num_classes
)
{}
/**
* calc_index should make sure that all siblings have the same weight indice.
* As for which weight index it maps to, it doesn't matter. To satisfy this,
* the id of root should be 1, and the left child of a node i is 2*i, the
* right child of a node i is 2*i+1.
*/
inline
size_t
calc_index
(
int
bit
)
const
{
return
(
c_
>>
(
bit
+
1
))
-
1
;
}
/**
* calc_bit uses the right most bits, while calc_index uses the left most
* bits. They are not the same, and that's why we say it doesn't matter which
* weight index calc_index maps to.
*/
inline
bool
calc_bit
(
int
bit
)
const
{
return
c_
&
(
1
<<
bit
);
}
inline
int
get_length
()
const
{
return
FindLastSet
(
c_
)
-
1
;
}
private:
size_t
c_
;
size_t
c_
;
// Here the id of root is 1 rather than 0, thus the id of class c
// is `c + num_classes`.
};
struct
SimpleCodeTable
{
...
...
@@ -83,7 +95,6 @@ struct SimpleCodeTable {
private:
size_t
num_classes_
;
int
max_code_length_
;
};
template
<
typename
T
>
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
e7a4cfc0
...
...
@@ -3858,29 +3858,32 @@ def nce(input,
return
cost
/
(
num_neg_samples
+
1
)
def
hsigmoid
(
input
,
label
,
num_classes
=
2
,
param_attr
=
None
,
bias_attr
=
None
):
def
hsigmoid
(
input
,
label
,
num_classes
,
param_attr
=
None
,
bias_attr
=
None
):
"""
The hierarchical sigmoid operator is used to accelerate the training
process of language model. This operator organizes the classes into a
complete binary tree, each leaf node represents a class(a word) and each internal
node acts likea binary classifier. For each word there's a unique path from root
to it's leaf node, hsigmoid calculate the cost for each internal node on the path
(include root), and sum them to get a total cost. hsigmoid can achive a acceleration
from N to logN, for which N represents the size of word dict. This idea is from "F.
Morin, Y. Bengio(AISTATS 05): Hierarchical Probabilistic Neural Network Language Model.
complete binary tree, each leaf node represents a class(a word) and each
internal node acts as a binary classifier. For each word there's a unique
path from root to it's leaf node, hsigmoid calculate the cost for each
internal node on the path, and sum them to get a total cost. hsigmoid can
achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
represents the size of word dict.
Refer to `Hierarchical Probabilistic Neural Network Language Model
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
Args:
input (Variable):
(Tensor) The input Tensor, which the shape is
[N * D], which N is the size of mini-batch,D is the embded size
label (Variable): (Tensor), The labels of training data. It's a
1-D tensor, which the shape is [1, N]
num_classes: (int, default 2), The number of classes, must be lager or
equal
than 2.
input (Variable):
The input tensor variable with shape
:math:`[N
\\
times D]`, where :math:`N` is the size of mini-batch,
and :math:`D` is the feature size.
label (Variable): The tensor variable contains labels of training data.
It's a tensor with shape is :math:`[N
\\
times 1]`.
num_classes: (int), The number of classes, must not be less
than 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for learnable parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for the bias of this layer. If it is set to
None, no bias
will be added to the output units
.
attribute for the bias of this layer. If it is set to
False, no
bias will be applied
.
Returns:
Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
...
...
@@ -3889,11 +3892,9 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
.. code-block:: python
x = fluid.layers.data(name='x', shape=[3, 2],
dtype='float32')
y = fluid.layers.data(name='y', shape=[1, 3],
dtype='int64')
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2)
x = fluid.layers.data(name='x', shape=[2], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
"""
helper
=
LayerHelper
(
'hierarchical_sigmoid'
,
**
locals
())
...
...
@@ -3902,7 +3903,7 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
pre_out
=
helper
.
create_tmp_variable
(
dtype
)
dim
=
input
.
shape
[
1
]
if
num_classes
<
2
:
raise
ValueError
(
"num_classes must
be lager or equal
than 2."
)
raise
ValueError
(
"num_classes must
not be less
than 2."
)
weights
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
[
num_classes
-
1
,
dim
],
...
...
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
浏览文件 @
e7a4cfc0
...
...
@@ -55,10 +55,7 @@ def hsigmoid(x, w, label, bias, num_classes):
length
=
code_table
.
get_length
()
for
k
in
range
(
length
):
idx
=
code_table
.
cal_index
(
k
)
sum
=
0.0
for
l
in
range
(
x
.
shape
[
1
]):
sum
+=
w
[
idx
][
l
]
*
x
[
j
][
l
]
pre_output
[
j
][
k
]
+=
sum
pre_output
[
j
][
k
]
=
np
.
dot
(
w
[
idx
],
x
[
j
])
# clip[-40.0, 40.0]
pre_output
=
np
.
clip
(
pre_output
,
-
40.0
,
40.0
)
# out(i, 0) = \sum_j bit(i, j) * preout(i, j)
...
...
@@ -71,7 +68,6 @@ def hsigmoid(x, w, label, bias, num_classes):
sum
+=
pre_output
[
i
][
j
]
out
[
i
]
=
-
1.0
*
sum
# soft relu
np
.
clip
(
pre_output
,
-
40.0
,
40.0
)
pre_output
=
np
.
log
(
1
+
np
.
exp
(
pre_output
))
pre_sum
=
pre_output
.
sum
(
1
).
reshape
((
batch_size
,
1
))
out
+=
pre_sum
...
...
@@ -81,11 +77,11 @@ def hsigmoid(x, w, label, bias, num_classes):
class
TestHSigmoidOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"hierarchical_sigmoid"
num_classes
=
4
embded_size
=
1
batch_size
=
1
x
=
np
.
random
.
random
((
batch_size
,
embded
_size
)).
astype
(
"float32"
)
w
=
np
.
random
.
random
((
num_classes
-
1
,
embded
_size
)).
astype
(
"float32"
)
num_classes
=
6
feature_size
=
5
batch_size
=
4
x
=
np
.
random
.
random
((
batch_size
,
feature
_size
)).
astype
(
"float32"
)
w
=
np
.
random
.
random
((
num_classes
-
1
,
feature
_size
)).
astype
(
"float32"
)
label
=
np
.
random
.
randint
(
0
,
num_classes
,
batch_size
)
bias
=
np
.
random
.
random
((
1
,
num_classes
-
1
)).
astype
(
"float32"
)
self
.
attrs
=
{
'num_classes'
:
num_classes
}
...
...
@@ -97,7 +93,7 @@ class TestHSigmoidOp(OpTest):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'Bias'
,
'X'
,
'W'
],
'Out'
,
no_grad_set
=
set
(
'Label'
))
self
.
check_grad
([
'Bias'
,
'X'
,
'W'
],
[
'Out'
]
,
no_grad_set
=
set
(
'Label'
))
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录