Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
e7a4cfc0
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e7a4cfc0
编写于
7月 11, 2018
作者:
G
guosheng
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
complete the hsigmoid_op
上级
d6953816
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
63 addition
and
54 deletion
+63
-54
paddle/fluid/operators/hierarchical_sigmoid_op.cc
paddle/fluid/operators/hierarchical_sigmoid_op.cc
+10
-10
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+10
-9
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+13
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+23
-22
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+7
-11
未找到文件。
paddle/fluid/operators/hierarchical_sigmoid_op.cc
浏览文件 @
e7a4cfc0
...
@@ -86,25 +86,25 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -86,25 +86,25 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
public:
void
Make
()
override
{
void
Make
()
override
{
AddInput
(
"X"
,
AddInput
(
"X"
,
"(Tensor, required) The input Tensor, which the shape is"
"(Tensor, required) The input tensor with shape [N, D], "
"[N, D], which N is the size of mini-batch,"
"where N is the size of mini-batch, and D is the feature size."
);
"D is the embded size"
);
AddInput
(
"W"
,
AddInput
(
"W"
,
"(Tensor, required), The parameters of hierarchical "
"(Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is
s
a 2-D tensor, the shape is"
"sigmoid operator, each of them is a 2-D tensor, the shape is"
"[num_classes - 1, D]"
);
"[num_classes - 1, D]
.
"
);
AddInput
(
"Label"
,
AddInput
(
"Label"
,
"(Tensor, required), The labels of training data. It's a"
"(Tensor, required), The labels of training data. It's a"
"
1-D tensor, which the shape is [N, 1]
"
);
"
tensor with shape [N, 1].
"
);
AddInput
(
"Bias"
,
AddInput
(
"Bias"
,
"(Tensor, optional), The bias is a tensor with shape"
"(Tensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1]"
);
"[1, num_classes - 1]
.
"
);
AddOutput
(
"Out"
,
AddOutput
(
"Out"
,
"(Tensor, required) The output of hierarchical sigmoid operator."
"(Tensor, required) The output of hierarchical sigmoid operator."
"
the shape is [N, 1]
"
);
"
The shape is [N, 1].
"
);
AddOutput
(
"PreOut"
,
AddOutput
(
"PreOut"
,
"(Tensor, required) A intermedia 2-D Tensor, which the shape is "
"(Tensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length]"
)
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes."
)
.
AsIntermediate
();
.
AsIntermediate
();
AddAttr
<
AttrType
>
(
"num_classes"
,
"(int, required), The number of classes"
)
AddAttr
<
AttrType
>
(
"num_classes"
,
"(int, required), The number of classes"
)
.
SetDefault
(
2
);
.
SetDefault
(
2
);
...
...
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
e7a4cfc0
...
@@ -44,9 +44,11 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
...
@@ -44,9 +44,11 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
framework
::
Tensor
sum
;
framework
::
Tensor
sum
;
math
::
SetConstant
<
DeviceContext
,
T
>
zero
;
math
::
SetConstant
<
DeviceContext
,
T
>
zero
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
pre_out_data
=
pre_out
->
mutable_data
<
T
>
(
auto
*
pre_out_data
=
pre_out
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
batch_size
,
code_length
}),
ctx
.
GetPlace
());
framework
::
make_ddim
({
batch_size
,
code_length
}),
ctx
.
GetPlace
());
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
// Not all class(leaf) nodes' path lengths equal code_length, thus init as
// 0s can avoid out of path's loss.
zero
(
dev_ctx
,
pre_out
,
static_cast
<
T
>
(
0.0
));
zero
(
dev_ctx
,
pre_out
,
static_cast
<
T
>
(
0.0
));
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
math
::
RowwiseSum
<
DeviceContext
,
T
>
row_sum
;
math
::
RowwiseSum
<
DeviceContext
,
T
>
row_sum
;
...
@@ -61,16 +63,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
...
@@ -61,16 +63,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
bit_code
.
Add
(
pre_out
,
*
bias
);
bit_code
.
Add
(
pre_out
,
*
bias
);
}
}
bit_code
.
Mul
(
pre_out
,
*
w
,
*
in
);
bit_code
.
Mul
(
pre_out
,
*
w
,
*
in
);
// clip t
he matrix with (-40, 40)
// clip t
o [-40, 40]
Transform
<
DeviceContext
>
trans
;
Transform
<
DeviceContext
>
trans
;
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
pre_out_data
,
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
pre_out_data
,
pre_out_data
+
pre_out
->
numel
(),
pre_out_data
,
pre_out_data
+
pre_out
->
numel
(),
pre_out_data
,
ClipFunctor
<
T
>
(
static_cast
<
T
>
(
-
40.0
),
static_cast
<
T
>
(
40.0
)));
ClipFunctor
<
T
>
(
static_cast
<
T
>
(
-
40.0
),
static_cast
<
T
>
(
40.0
)));
bit_code
.
Sum
(
*
pre_out
,
out
,
static_cast
<
T
>
(
-
1
));
bit_code
.
Sum
(
*
pre_out
,
out
,
static_cast
<
T
>
(
-
1
));
// softrelu with threshold is 40.0
// use softrelu to calculate cross entropy
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
pre_out_data
,
pre_out_data
+
pre_out
->
numel
(),
pre_out_data
,
ClipFunctor
<
T
>
(
static_cast
<
T
>
(
-
40.0
),
static_cast
<
T
>
(
40.0
)));
pre_out_mat
.
device
(
place
)
=
(
static_cast
<
T
>
(
1.0
)
+
pre_out_mat
.
exp
()).
log
();
pre_out_mat
.
device
(
place
)
=
(
static_cast
<
T
>
(
1.0
)
+
pre_out_mat
.
exp
()).
log
();
row_sum
(
dev_ctx
,
*
pre_out
,
&
sum
);
row_sum
(
dev_ctx
,
*
pre_out
,
&
sum
);
out_mat
.
device
(
place
)
=
sum_mat
+
out_mat
;
out_mat
.
device
(
place
)
=
sum_mat
+
out_mat
;
...
@@ -102,14 +101,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
...
@@ -102,14 +101,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
math
::
MatrixBitCodeFunctor
<
T
>
bit_code
(
num_classes
,
label
->
data
<
int64_t
>
());
math
::
MatrixBitCodeFunctor
<
T
>
bit_code
(
num_classes
,
label
->
data
<
int64_t
>
());
// softrelu derivative
Eigen
::
array
<
int
,
2
>
bcast
({{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])}});
Eigen
::
array
<
int
,
2
>
bcast
({
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])});
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
*
out_grad
);
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
*
out_grad
);
pre_out_grad_mat
=
out_grad_mat
.
broadcast
(
bcast
);
pre_out_grad_mat
=
out_grad_mat
.
broadcast
(
bcast
);
pre_out_grad_mat
.
device
(
place
)
=
pre_out_grad_mat
.
device
(
place
)
=
pre_out_grad_mat
*
pre_out_grad_mat
*
(
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
());
(
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
());
// softrelu derivative
bit_code
.
Sub
(
&
pre_out_grad
);
bit_code
.
Sub
(
&
pre_out_grad
);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if
(
bias_grad
)
{
if
(
bias_grad
)
{
bias_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
bias_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
bit_code
.
AddGrad
(
pre_out_grad
,
bias_grad
);
bit_code
.
AddGrad
(
pre_out_grad
,
bias_grad
);
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
e7a4cfc0
...
@@ -65,12 +65,24 @@ inline constexpr size_t FindLastSet(size_t x) {
...
@@ -65,12 +65,24 @@ inline constexpr size_t FindLastSet(size_t x) {
struct
SimpleCode
{
struct
SimpleCode
{
SimpleCode
(
size_t
code
,
size_t
num_classes
)
:
c_
(
code
+
num_classes
)
{}
SimpleCode
(
size_t
code
,
size_t
num_classes
)
:
c_
(
code
+
num_classes
)
{}
/**
* calc_index should make sure that all siblings have the same weight indice.
* As for which weight index it maps to, it doesn't matter. To satisfy this,
* the id of root should be 1, and the left child of a node i is 2*i, the
* right child of a node i is 2*i+1.
*/
inline
size_t
calc_index
(
int
bit
)
const
{
return
(
c_
>>
(
bit
+
1
))
-
1
;
}
inline
size_t
calc_index
(
int
bit
)
const
{
return
(
c_
>>
(
bit
+
1
))
-
1
;
}
/**
* calc_bit uses the right most bits, while calc_index uses the left most
* bits. They are not the same, and that's why we say it doesn't matter which
* weight index calc_index maps to.
*/
inline
bool
calc_bit
(
int
bit
)
const
{
return
c_
&
(
1
<<
bit
);
}
inline
bool
calc_bit
(
int
bit
)
const
{
return
c_
&
(
1
<<
bit
);
}
inline
int
get_length
()
const
{
return
FindLastSet
(
c_
)
-
1
;
}
inline
int
get_length
()
const
{
return
FindLastSet
(
c_
)
-
1
;
}
private:
private:
size_t
c_
;
size_t
c_
;
// Here the id of root is 1 rather than 0, thus the id of class c
// is `c + num_classes`.
};
};
struct
SimpleCodeTable
{
struct
SimpleCodeTable
{
...
@@ -83,7 +95,6 @@ struct SimpleCodeTable {
...
@@ -83,7 +95,6 @@ struct SimpleCodeTable {
private:
private:
size_t
num_classes_
;
size_t
num_classes_
;
int
max_code_length_
;
};
};
template
<
typename
T
>
template
<
typename
T
>
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
e7a4cfc0
...
@@ -3858,29 +3858,32 @@ def nce(input,
...
@@ -3858,29 +3858,32 @@ def nce(input,
return
cost
/
(
num_neg_samples
+
1
)
return
cost
/
(
num_neg_samples
+
1
)
def
hsigmoid
(
input
,
label
,
num_classes
=
2
,
param_attr
=
None
,
bias_attr
=
None
):
def
hsigmoid
(
input
,
label
,
num_classes
,
param_attr
=
None
,
bias_attr
=
None
):
"""
"""
The hierarchical sigmoid operator is used to accelerate the training
The hierarchical sigmoid operator is used to accelerate the training
process of language model. This operator organizes the classes into a
process of language model. This operator organizes the classes into a
complete binary tree, each leaf node represents a class(a word) and each internal
complete binary tree, each leaf node represents a class(a word) and each
node acts likea binary classifier. For each word there's a unique path from root
internal node acts as a binary classifier. For each word there's a unique
to it's leaf node, hsigmoid calculate the cost for each internal node on the path
path from root to it's leaf node, hsigmoid calculate the cost for each
(include root), and sum them to get a total cost. hsigmoid can achive a acceleration
internal node on the path, and sum them to get a total cost. hsigmoid can
from N to logN, for which N represents the size of word dict. This idea is from "F.
achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
Morin, Y. Bengio(AISTATS 05): Hierarchical Probabilistic Neural Network Language Model.
represents the size of word dict.
Refer to `Hierarchical Probabilistic Neural Network Language Model
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
Args:
Args:
input (Variable):
(Tensor) The input Tensor, which the shape is
input (Variable):
The input tensor variable with shape
[N * D], which N is the size of mini-batch,D is the embded size
:math:`[N
\\
times D]`, where :math:`N` is the size of mini-batch,
label (Variable): (Tensor), The labels of training data. It's a
and :math:`D` is the feature size.
1-D tensor, which the shape is [1, N]
label (Variable): The tensor variable contains labels of training data.
num_classes: (int, default 2), The number of classes, must be lager or
It's a tensor with shape is :math:`[N
\\
times 1]`.
equal
than 2.
num_classes: (int), The number of classes, must not be less
than 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter
param_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for learnable parameters/weights of this layer.
attribute for learnable parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for the bias of this layer. If it is set to
None, no bias
attribute for the bias of this layer. If it is set to
False, no
will be added to the output units
.
bias will be applied
.
Returns:
Returns:
Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
...
@@ -3889,11 +3892,9 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
...
@@ -3889,11 +3892,9 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
.. code-block:: python
.. code-block:: python
x = fluid.layers.data(name='x', shape=[3, 2],
x = fluid.layers.data(name='x', shape=[2], dtype='float32')
dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
y = fluid.layers.data(name='y', shape=[1, 3],
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
dtype='int64')
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2)
"""
"""
helper
=
LayerHelper
(
'hierarchical_sigmoid'
,
**
locals
())
helper
=
LayerHelper
(
'hierarchical_sigmoid'
,
**
locals
())
...
@@ -3902,7 +3903,7 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
...
@@ -3902,7 +3903,7 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
pre_out
=
helper
.
create_tmp_variable
(
dtype
)
pre_out
=
helper
.
create_tmp_variable
(
dtype
)
dim
=
input
.
shape
[
1
]
dim
=
input
.
shape
[
1
]
if
num_classes
<
2
:
if
num_classes
<
2
:
raise
ValueError
(
"num_classes must
be lager or equal
than 2."
)
raise
ValueError
(
"num_classes must
not be less
than 2."
)
weights
=
helper
.
create_parameter
(
weights
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
attr
=
helper
.
param_attr
,
shape
=
[
num_classes
-
1
,
dim
],
shape
=
[
num_classes
-
1
,
dim
],
...
...
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
浏览文件 @
e7a4cfc0
...
@@ -55,10 +55,7 @@ def hsigmoid(x, w, label, bias, num_classes):
...
@@ -55,10 +55,7 @@ def hsigmoid(x, w, label, bias, num_classes):
length
=
code_table
.
get_length
()
length
=
code_table
.
get_length
()
for
k
in
range
(
length
):
for
k
in
range
(
length
):
idx
=
code_table
.
cal_index
(
k
)
idx
=
code_table
.
cal_index
(
k
)
sum
=
0.0
pre_output
[
j
][
k
]
=
np
.
dot
(
w
[
idx
],
x
[
j
])
for
l
in
range
(
x
.
shape
[
1
]):
sum
+=
w
[
idx
][
l
]
*
x
[
j
][
l
]
pre_output
[
j
][
k
]
+=
sum
# clip[-40.0, 40.0]
# clip[-40.0, 40.0]
pre_output
=
np
.
clip
(
pre_output
,
-
40.0
,
40.0
)
pre_output
=
np
.
clip
(
pre_output
,
-
40.0
,
40.0
)
# out(i, 0) = \sum_j bit(i, j) * preout(i, j)
# out(i, 0) = \sum_j bit(i, j) * preout(i, j)
...
@@ -71,7 +68,6 @@ def hsigmoid(x, w, label, bias, num_classes):
...
@@ -71,7 +68,6 @@ def hsigmoid(x, w, label, bias, num_classes):
sum
+=
pre_output
[
i
][
j
]
sum
+=
pre_output
[
i
][
j
]
out
[
i
]
=
-
1.0
*
sum
out
[
i
]
=
-
1.0
*
sum
# soft relu
# soft relu
np
.
clip
(
pre_output
,
-
40.0
,
40.0
)
pre_output
=
np
.
log
(
1
+
np
.
exp
(
pre_output
))
pre_output
=
np
.
log
(
1
+
np
.
exp
(
pre_output
))
pre_sum
=
pre_output
.
sum
(
1
).
reshape
((
batch_size
,
1
))
pre_sum
=
pre_output
.
sum
(
1
).
reshape
((
batch_size
,
1
))
out
+=
pre_sum
out
+=
pre_sum
...
@@ -81,11 +77,11 @@ def hsigmoid(x, w, label, bias, num_classes):
...
@@ -81,11 +77,11 @@ def hsigmoid(x, w, label, bias, num_classes):
class
TestHSigmoidOp
(
OpTest
):
class
TestHSigmoidOp
(
OpTest
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
op_type
=
"hierarchical_sigmoid"
self
.
op_type
=
"hierarchical_sigmoid"
num_classes
=
4
num_classes
=
6
embded_size
=
1
feature_size
=
5
batch_size
=
1
batch_size
=
4
x
=
np
.
random
.
random
((
batch_size
,
embded
_size
)).
astype
(
"float32"
)
x
=
np
.
random
.
random
((
batch_size
,
feature
_size
)).
astype
(
"float32"
)
w
=
np
.
random
.
random
((
num_classes
-
1
,
embded
_size
)).
astype
(
"float32"
)
w
=
np
.
random
.
random
((
num_classes
-
1
,
feature
_size
)).
astype
(
"float32"
)
label
=
np
.
random
.
randint
(
0
,
num_classes
,
batch_size
)
label
=
np
.
random
.
randint
(
0
,
num_classes
,
batch_size
)
bias
=
np
.
random
.
random
((
1
,
num_classes
-
1
)).
astype
(
"float32"
)
bias
=
np
.
random
.
random
((
1
,
num_classes
-
1
)).
astype
(
"float32"
)
self
.
attrs
=
{
'num_classes'
:
num_classes
}
self
.
attrs
=
{
'num_classes'
:
num_classes
}
...
@@ -97,7 +93,7 @@ class TestHSigmoidOp(OpTest):
...
@@ -97,7 +93,7 @@ class TestHSigmoidOp(OpTest):
self
.
check_output
()
self
.
check_output
()
def
test_check_grad
(
self
):
def
test_check_grad
(
self
):
self
.
check_grad
([
'Bias'
,
'X'
,
'W'
],
'Out'
,
no_grad_set
=
set
(
'Label'
))
self
.
check_grad
([
'Bias'
,
'X'
,
'W'
],
[
'Out'
]
,
no_grad_set
=
set
(
'Label'
))
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录