Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
12e1719f
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
12e1719f
编写于
11月 28, 2018
作者:
J
Jiabin Yang
提交者:
GitHub
11月 28, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #14352 from JiabinYang/enhance_hierachical_sigmod_op
Enhance hierarchical sigmoid op
上级
36e26a53
eda06906
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
765 addition
and
131 deletion
+765
-131
paddle/fluid/API.spec
paddle/fluid/API.spec
+1
-1
paddle/fluid/framework/selected_rows.h
paddle/fluid/framework/selected_rows.h
+18
-3
paddle/fluid/operators/hierarchical_sigmoid_op.cc
paddle/fluid/operators/hierarchical_sigmoid_op.cc
+89
-21
paddle/fluid/operators/hierarchical_sigmoid_op.h
paddle/fluid/operators/hierarchical_sigmoid_op.h
+122
-39
paddle/fluid/operators/math/matrix_bit_code.cc
paddle/fluid/operators/math/matrix_bit_code.cc
+65
-30
paddle/fluid/operators/math/matrix_bit_code.h
paddle/fluid/operators/math/matrix_bit_code.h
+119
-12
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+81
-20
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+251
-5
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+19
-0
未找到文件。
paddle/fluid/API.spec
浏览文件 @
12e1719f
...
...
@@ -98,7 +98,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=
paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'
], varargs=None, keywords=None, defaults=(None, None, Non
e))
paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'
, 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, Fals
e))
paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
...
...
paddle/fluid/framework/selected_rows.h
浏览文件 @
12e1719f
...
...
@@ -120,8 +120,22 @@ class SelectedRows {
*/
int64_t
AutoGrownIndex
(
int64_t
key
,
bool
auto_grown
,
bool
is_test
=
false
);
void
SyncIndex
();
/*
* @brief Get the index of the key from id_to_index_ map.
*/
inline
int64_t
GetIndexFromId
(
int64_t
key
)
{
auto
iter
=
id_to_index_
.
find
(
key
);
if
(
iter
==
id_to_index_
.
end
())
{
return
-
1
;
}
else
{
return
iter
->
second
;
}
}
void
SyncIndex
();
/*
* @brief Get complete Dims before
*/
DDim
GetCompleteDims
()
const
{
std
::
vector
<
int64_t
>
dims
=
vectorize
(
value_
->
dims
());
dims
[
0
]
=
height_
;
...
...
@@ -133,9 +147,10 @@ class SelectedRows {
// SelectedRows are simply concated when adding together. Until a
// SelectedRows add a Tensor, will the duplicate rows be handled.
Vector
<
int64_t
>
rows_
;
std
::
unordered_map
<
int64_t
,
int64_t
>
id_to_index_
;
std
::
unordered_map
<
int64_t
,
int64_t
>
id_to_index_
;
// should not be used when rows_ has duplicate member
std
::
unique_ptr
<
Tensor
>
value_
{
nullptr
};
int64_t
height_
;
int64_t
height_
;
// height indicates the underline tensor's height
std
::
unique_ptr
<
RWLock
>
rwlock_
{
nullptr
};
};
...
...
paddle/fluid/operators/hierarchical_sigmoid_op.cc
浏览文件 @
12e1719f
...
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <string>
#include <vector>
namespace
paddle
{
namespace
operators
{
...
...
@@ -70,13 +70,14 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
const
int64_t
batch_size
=
ctx
->
GetInputDim
(
"X"
)[
0
];
std
::
vector
<
int64_t
>
output_shape
({
batch_size
,
1
});
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
output_shape
));
ctx
->
ShareLoD
(
"X"
,
/*->*/
"Out"
);
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
)
->
type
()),
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
LoD
Tensor
>
(
"X"
)
->
type
()),
ctx
.
GetPlace
());
}
};
...
...
@@ -86,27 +87,40 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void
Make
()
override
{
AddInput
(
"X"
,
"(Tensor, required) The input tensor with shape [N, D], "
"(
LoD
Tensor, required) The input tensor with shape [N, D], "
"where N is the size of mini-batch, and D is the feature size."
);
AddInput
(
"W"
,
"(Tensor, required), The parameters of hierarchical "
"(
LoD
Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is a 2-D tensor, the shape is"
"[
num_classes - 1, D].
"
);
"[
K, D]. Which K is the num of non-leaf node in Path Tree
"
);
AddInput
(
"Label"
,
"(Tensor, required), The labels of training data. It's a"
"(
LoD
Tensor, required), The labels of training data. It's a"
"tensor with shape [N, 1]."
);
AddInput
(
"PTable"
,
"(LoDTensor, optional), The Path Table from root to current word"
"it should have shape like [N, L], L is the length of the Path"
)
.
AsDispensable
();
AddInput
(
"PathCode"
,
"(LoDTensor, optional), The Code on each Node of the Path from root "
"to current word"
"it should have shape like [N, L], L is the length of the Path"
)
.
AsDispensable
();
AddInput
(
"Bias"
,
"(Tensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1]."
);
AddOutput
(
"Out"
,
"(Tensor, required) The output of hierarchical sigmoid operator."
"The shape is [N, 1]."
);
"(LoDTensor, optional), The bias is a tensor with shape or "
"[num_classes, 1]"
"[num_classes - 1, 1]."
)
.
AsDispensable
();
AddOutput
(
"Out"
,
"(LoDTensor, required) The output of hierarchical sigmoid operator."
"The shape is [N, 1]."
);
AddOutput
(
"PreOut"
,
"(Tensor, required) A intermedia 2-D tensor with shape "
"(
LoD
Tensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes."
)
.
AsIntermediate
();
AddAttr
<
AttrType
>
(
"num_classes"
,
"(int,
required
), The number of classes"
)
AddAttr
<
AttrType
>
(
"num_classes"
,
"(int,
optional
), The number of classes"
)
.
SetDefault
(
2
);
AddComment
(
R"DOC(
The hierarchical sigmoid operator organize the classes into a binary tree.
...
...
@@ -115,6 +129,10 @@ belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model."
)DOC"
);
AddAttr
<
bool
>
(
"is_sparse"
,
"(boolean, default false) "
"Sparse update."
)
.
SetDefault
(
false
);
}
};
...
...
@@ -124,16 +142,21 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"W"
),
"Input(W) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"Label"
),
"Input(Label) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
framework
::
GradVarName
(
"Out"
)),
"Input(Out@Grad) should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasInput
(
"PreOut"
),
"Input(Preout) should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"W"
)),
"Output(W@Grad should not be null.)"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)));
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Bias"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Bias"
),
ctx
->
GetInputDim
(
"Bias"
));
"Output(W@Grad should not be null."
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"X"
)),
"Output(X@Grad should not be null."
);
if
(
!
ctx
->
Attrs
().
Get
<
bool
>
(
"is_sparse"
))
{
if
(
ctx
->
HasOutput
(
framework
::
GradVarName
(
"Bias"
)))
{
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"Bias"
),
ctx
->
GetInputDim
(
"Bias"
));
}
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"W"
),
ctx
->
GetInputDim
(
"W"
));
}
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"W"
),
ctx
->
GetInputDim
(
"W"
));
ctx
->
SetOutputDim
(
framework
::
GradVarName
(
"X"
),
ctx
->
GetInputDim
(
"X"
));
}
...
...
@@ -141,11 +164,55 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
)
->
type
()),
framework
::
ToDataType
(
ctx
.
Input
<
framework
::
LoD
Tensor
>
(
"X"
)
->
type
()),
ctx
.
GetPlace
());
}
};
class
HierarchicalSigmoidGradOpGradVarTypeInference
:
public
framework
::
VarTypeInference
{
public:
void
operator
()(
const
framework
::
OpDesc
&
op_desc
,
framework
::
BlockDesc
*
block
)
const
override
{
auto
w_grad_var_name
=
op_desc
.
Output
(
framework
::
GradVarName
(
"W"
)).
front
();
auto
bias_grad_var_name_vec
=
op_desc
.
Output
(
framework
::
GradVarName
(
"Bias"
));
std
::
string
bias_grad_var_name
;
bool
hasBias
=
false
;
if
(
bias_grad_var_name_vec
.
size
())
{
hasBias
=
true
;
bias_grad_var_name
=
op_desc
.
Output
(
framework
::
GradVarName
(
"Bias"
)).
front
();
}
auto
attr
=
op_desc
.
GetAttr
(
"is_sparse"
);
bool
is_sparse
=
boost
::
get
<
bool
>
(
attr
);
if
(
is_sparse
)
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to SelectedRows"
;
block
->
Var
(
w_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
if
(
hasBias
)
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"Bias"
)
<<
" is set to SelectedRows"
;
block
->
Var
(
bias_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
SELECTED_ROWS
);
}
}
else
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"W"
)
<<
" is set to LoDTensor"
;
block
->
Var
(
w_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
if
(
hasBias
)
{
VLOG
(
30
)
<<
"hierarchical_sigmoid_grad op "
<<
framework
::
GradVarName
(
"Bias"
)
<<
" is set to LoDTensor"
;
block
->
Var
(
bias_grad_var_name
)
->
SetType
(
framework
::
proto
::
VarType
::
LOD_TENSOR
);
}
}
block
->
Var
(
w_grad_var_name
)
->
SetDataType
(
block
->
Var
(
"W"
)
->
GetDataType
());
}
};
}
// namespace operators
}
// namespace paddle
...
...
@@ -153,7 +220,8 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
hierarchical_sigmoid
,
ops
::
HierarchicalSigmoidOp
,
ops
::
HierarchicalSigmoidOpMaker
<
int
>
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
hierarchical_sigmoid_grad
,
ops
::
HierarchicalSigmoidGradOp
);
REGISTER_OPERATOR
(
hierarchical_sigmoid_grad
,
ops
::
HierarchicalSigmoidGradOp
,
ops
::
HierarchicalSigmoidGradOpGradVarTypeInference
);
REGISTER_OP_CPU_KERNEL
(
hierarchical_sigmoid
,
ops
::
HierarchicalSigmoidOpKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
...
...
paddle/fluid/operators/hierarchical_sigmoid_op.h
浏览文件 @
12e1719f
...
...
@@ -14,12 +14,16 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <set>
#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -28,20 +32,38 @@ template <typename T, int MajorType = Eigen::RowMajor,
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
using
platform
::
Transform
;
static
std
::
vector
<
int64_t
>
PathToRows
(
const
framework
::
LoDTensor
&
path
)
{
std
::
set
<
int64_t
>
rows
;
for
(
int64_t
i
=
0
;
i
<
path
.
numel
();
++
i
)
{
int64_t
row
=
path
.
data
<
int64_t
>
()[
i
];
if
(
row
<
0
)
{
continue
;
}
rows
.
emplace
(
row
);
}
return
std
::
vector
<
int64_t
>
(
rows
.
begin
(),
rows
.
end
());
}
template
<
typename
DeviceContext
,
typename
T
>
class
HierarchicalSigmoidOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
w
=
ctx
.
Input
<
framework
::
Tensor
>
(
"W"
);
auto
*
label
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Label"
);
auto
*
bias
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Bias"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
auto
*
pre_out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"PreOut"
);
auto
&
in
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"X"
));
auto
&
w
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"W"
));
auto
*
path
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PTable"
);
auto
*
code
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PathCode"
);
auto
&
label
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Label"
));
auto
*
bias
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Bias"
);
auto
*
out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"Out"
);
auto
*
pre_out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"PreOut"
);
size_t
num_classes
=
static_cast
<
size_t
>
(
ctx
.
Attr
<
int
>
(
"num_classes"
));
int64_t
code_length
=
math
::
FindLastSet
(
num_classes
-
1
);
int64_t
batch_size
=
in
->
dims
()[
0
];
framework
::
Tensor
sum
;
bool
is_custom
=
false
;
if
(
path
)
{
is_custom
=
true
;
}
int64_t
code_length
=
path
?
path
->
dims
()[
1
]
:
math
::
FindLastSet
(
num_classes
-
1
);
int64_t
batch_size
=
in
.
dims
()[
0
];
framework
::
LoDTensor
sum
;
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
*
pre_out_data
=
pre_out
->
mutable_data
<
T
>
(
framework
::
make_ddim
({
batch_size
,
code_length
}),
ctx
.
GetPlace
());
...
...
@@ -52,7 +74,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
zero
(
dev_ctx
,
pre_out
,
static_cast
<
T
>
(
0.0
));
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
math
::
RowwiseSum
<
DeviceContext
,
T
>
row_sum
;
math
::
MatrixBitCodeFunctor
<
T
>
bit_code
(
num_classes
,
label
->
data
<
int64_t
>
());
std
::
unique_ptr
<
math
::
MatrixBitCodeFunctor
<
T
>>
bit_code
;
if
(
!
is_custom
)
{
bit_code
.
reset
(
new
math
::
MatrixBitCodeFunctor
<
T
>
(
num_classes
,
label
.
data
<
int64_t
>
()));
}
else
{
bit_code
.
reset
(
new
math
::
MatrixBitCodeFunctor
<
T
>
(
*
path
,
*
code
,
label
.
data
<
int64_t
>
()));
}
std
::
vector
<
int64_t
>
sum_dims
({
batch_size
,
1UL
});
sum
.
mutable_data
<
T
>
(
framework
::
make_ddim
(
sum_dims
),
ctx
.
GetPlace
());
...
...
@@ -60,15 +90,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
out_mat
=
framework
::
EigenVector
<
T
>::
Flatten
(
*
out
);
if
(
bias
)
{
bit_code
.
Add
(
pre_out
,
*
bias
);
bit_code
->
Add
(
*
bias
,
pre_out
);
}
bit_code
.
Mul
(
pre_out
,
*
w
,
*
in
);
bit_code
->
Mul
(
pre_out
,
w
,
in
);
// clip to [-40, 40]
Transform
<
DeviceContext
>
trans
;
trans
(
ctx
.
template
device_context
<
DeviceContext
>(),
pre_out_data
,
pre_out_data
+
pre_out
->
numel
(),
pre_out_data
,
ClipFunctor
<
T
>
(
static_cast
<
T
>
(
-
40.0
),
static_cast
<
T
>
(
40.0
)));
bit_code
.
Sum
(
*
pre_out
,
out
,
static_cast
<
T
>
(
-
1
));
bit_code
->
Sum
(
*
pre_out
,
out
,
static_cast
<
T
>
(
-
1
));
// use softrelu to calculate cross entropy
pre_out_mat
.
device
(
place
)
=
(
static_cast
<
T
>
(
1.0
)
+
pre_out_mat
.
exp
()).
log
();
row_sum
(
dev_ctx
,
*
pre_out
,
&
sum
);
...
...
@@ -84,50 +114,103 @@ template <typename DeviceContext, typename T>
class
HierarchicalSigmoidGradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
w
=
ctx
.
Input
<
framework
::
Tensor
>
(
"W"
);
auto
*
in_grad
=
ctx
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
w_grad
=
ctx
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"W"
));
auto
*
bias_grad
=
ctx
.
Output
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
*
label
=
ctx
.
Input
<
framework
::
Tensor
>
(
"Label"
);
auto
*
pre_out
=
ctx
.
Input
<
framework
::
Tensor
>
(
"PreOut"
);
auto
*
out_grad
=
ctx
.
Input
<
framework
::
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
framework
::
Tensor
pre_out_grad
;
pre_out_grad
.
mutable_data
<
T
>
(
pre_out
->
dims
(),
ctx
.
GetPlace
());
in_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
w_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
in
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"X"
));
auto
&
w
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"W"
));
auto
*
path
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PTable"
);
auto
*
code
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PathCode"
);
auto
*
bias
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Bias"
);
auto
*
in_grad
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"X"
));
bool
is_sparse
=
ctx
.
Attr
<
bool
>
(
"is_sparse"
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
math
::
SetConstant
<
DeviceContext
,
T
>
zero
;
auto
&
label
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"Label"
));
auto
&
pre_out
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
"PreOut"
));
auto
&
out_grad
=
detail
::
Ref
(
ctx
.
Input
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"Out"
)));
framework
::
LoDTensor
pre_out_grad
;
pre_out_grad
.
mutable_data
<
T
>
(
pre_out
.
dims
(),
ctx
.
GetPlace
());
in_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
zero
(
dev_ctx
,
in_grad
,
static_cast
<
T
>
(
0.0
));
zero
(
dev_ctx
,
w_grad
,
static_cast
<
T
>
(
0.0
));
size_t
num_classes
=
static_cast
<
size_t
>
(
ctx
.
Attr
<
int
>
(
"num_classes"
));
math
::
MatrixBitCodeFunctor
<
T
>
bit_code
(
num_classes
,
label
->
data
<
int64_t
>
());
bool
is_custom
=
false
;
if
(
path
)
{
is_custom
=
true
;
}
std
::
unique_ptr
<
math
::
MatrixBitCodeFunctor
<
T
>>
bit_code
;
if
(
!
is_custom
)
{
bit_code
.
reset
(
new
math
::
MatrixBitCodeFunctor
<
T
>
(
num_classes
,
label
.
data
<
int64_t
>
()));
}
else
{
bit_code
.
reset
(
new
math
::
MatrixBitCodeFunctor
<
T
>
(
*
path
,
*
code
,
label
.
data
<
int64_t
>
()));
}
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
*
pre_out
);
auto
pre_out_mat
=
EigenMatrix
<
T
>::
From
(
pre_out
);
auto
pre_out_grad_mat
=
EigenMatrix
<
T
>::
From
(
pre_out_grad
);
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
*
out_grad
);
auto
out_grad_mat
=
EigenMatrix
<
T
>::
From
(
out_grad
);
Eigen
::
array
<
int
,
2
>
bcast
{
1
,
static_cast
<
int
>
(
pre_out_grad
.
dims
()[
1
])};
// softrelu derivative
pre_out_grad_mat
.
device
(
place
)
=
static_cast
<
T
>
(
1.0
)
-
static_cast
<
T
>
(
1.0
)
/
pre_out_mat
.
exp
();
bit_code
.
Sub
(
&
pre_out_grad
);
// the gradient of clip(w * x + b)
bit_code
->
Sub
(
&
pre_out_grad
);
// the gradient of clip(w * x + b)
pre_out_grad_mat
.
device
(
place
)
=
pre_out_grad_mat
*
out_grad_mat
.
broadcast
(
bcast
);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if
(
bias_grad
)
{
bias_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
zero
(
dev_ctx
,
bias_grad
,
static_cast
<
T
>
(
0.0
));
bit_code
.
AddGrad
(
pre_out_grad
,
bias_grad
);
if
(
!
is_sparse
)
{
auto
*
bias_grad
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"Bias"
));
if
(
bias_grad
)
{
bias_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
zero
(
dev_ctx
,
bias_grad
,
static_cast
<
T
>
(
0.0
));
bit_code
->
AddGrad
(
pre_out_grad
,
bias_grad
);
}
auto
*
w_grad
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
framework
::
GradVarName
(
"W"
));
w_grad
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
zero
(
dev_ctx
,
w_grad
,
static_cast
<
T
>
(
0.0
));
bit_code
->
MulGradWeight
(
pre_out_grad
,
w_grad
,
in
);
}
else
{
framework
::
Vector
<
int64_t
>
real_rows
=
PathToRows
(
*
path
);
auto
*
w_grad
=
ctx
.
Output
<
framework
::
SelectedRows
>
(
framework
::
GradVarName
(
"W"
));
w_grad
->
set_rows
(
real_rows
);
// Build a map of id -> row_index to speed up finding the index of one id
w_grad
->
SyncIndex
();
w_grad
->
set_height
(
w
.
dims
()[
0
]);
auto
*
w_grad_value
=
w_grad
->
mutable_value
();
framework
::
DDim
temp_dim
(
w
.
dims
());
set
(
temp_dim
,
0
,
real_rows
.
size
());
w_grad_value
->
mutable_data
<
T
>
(
temp_dim
,
ctx
.
GetPlace
());
zero
(
dev_ctx
,
w_grad_value
,
static_cast
<
T
>
(
0.0
));
auto
*
bias_grad
=
ctx
.
Output
<
framework
::
SelectedRows
>
(
framework
::
GradVarName
(
"Bias"
));
if
(
bias_grad
)
{
bias_grad
->
set_rows
(
real_rows
);
// build ids -> rows index map
bias_grad
->
SyncIndex
();
bias_grad
->
set_height
(
bias
->
dims
()[
0
]);
auto
*
bias_grad_value
=
bias_grad
->
mutable_value
();
std
::
vector
<
int64_t
>
dims
=
{
static_cast
<
int64_t
>
(
real_rows
.
size
()),
bias
->
dims
()[
1
]};
bias_grad_value
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
dims
),
ctx
.
GetPlace
());
zero
(
dev_ctx
,
bias_grad_value
,
static_cast
<
T
>
(
0.0
));
bit_code
->
AddGrad
(
pre_out_grad
,
bias_grad
);
}
bit_code
->
MulGradWeight
(
pre_out_grad
,
w_grad
,
in
);
}
bit_code
.
MulGradWeight
(
pre_out_grad
,
w_grad
,
*
in
);
bit_code
.
MulGradError
(
pre_out_grad
,
*
w
,
in_grad
);
bit_code
->
MulGradError
(
pre_out_grad
,
w
,
in_grad
);
}
};
...
...
paddle/fluid/operators/math/matrix_bit_code.cc
浏览文件 @
12e1719f
...
...
@@ -19,16 +19,15 @@ namespace operators {
namespace
math
{
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Add
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
vec
)
{
SimpleCodeTable
code_table
(
num_classes_
);
void
MatrixBitCodeFunctor
<
T
>::
Add
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
)
{
size_t
batch_size
=
tmat
->
dims
()[
0
];
size_t
width
=
tmat
->
dims
()[
1
];
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
.
calc_index
(
j
);
size_t
index
=
code
->
calc_index
(
j
);
tmat
->
data
<
T
>
()[
i
*
width
+
j
]
+=
vec
.
data
<
T
>
()[
index
];
}
}
...
...
@@ -37,31 +36,46 @@ void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
)
{
SimpleCodeTable
code_table
(
num_classes_
);
size_t
batch_size
=
tmat
.
dims
()[
0
];
size_t
width
=
tmat
.
dims
()[
1
];
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
.
calc_index
(
j
);
size_t
index
=
code
->
calc_index
(
j
);
vec
->
data
<
T
>
()[
index
]
+=
tmat
.
data
<
T
>
()[
i
*
width
+
j
];
}
}
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
)
{
size_t
batch_size
=
tmat
.
dims
()[
0
];
size_t
width
=
tmat
.
dims
()[
1
];
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
int64_t
row_index
=
vec
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
vec
->
mutable_value
()
->
data
<
T
>
()[
row_index
]
+=
tmat
.
data
<
T
>
()[
i
*
width
+
j
];
}
}
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sum
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
sum
,
T
scale_sum
)
{
SimpleCodeTable
code_table
(
num_classes_
);
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
o_width
=
tmat
.
dims
()[
1
];
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
T
sm
=
static_cast
<
T
>
(
0.0
);
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
if
(
code
.
calc_bit
(
j
))
{
if
(
code
->
calc_bit
(
j
))
{
// calc_bit starts from right most bit, while data in tmat[i] is in the
// reverse order.
sm
+=
tmat
.
data
<
T
>
()[
i
*
o_width
+
j
];
...
...
@@ -75,7 +89,6 @@ template <typename T>
void
MatrixBitCodeFunctor
<
T
>::
Mul
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
weight
,
const
framework
::
Tensor
&
input
)
{
SimpleCodeTable
code_table
(
num_classes_
);
size_t
num_samples
=
tmat
->
dims
()[
0
];
size_t
tmat_width
=
tmat
->
dims
()[
1
];
size_t
input_width
=
input
.
dims
()[
1
];
...
...
@@ -84,10 +97,10 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
auto
weight_value
=
weight
.
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
.
calc_index
(
j
);
size_t
index
=
code
->
calc_index
(
j
);
T
sum
=
static_cast
<
T
>
(
0.0
);
for
(
size_t
k
=
0
;
k
<
input_width
;
++
k
)
{
sum
+=
weight_value
[
weight_width
*
index
+
k
]
*
...
...
@@ -102,7 +115,6 @@ template <typename T>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
)
{
SimpleCodeTable
code_table
(
num_classes_
);
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
...
...
@@ -111,10 +123,10 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
auto
weight_value
=
weight
->
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
.
calc_index
(
j
);
size_t
index
=
code
->
calc_index
(
j
);
for
(
size_t
k
=
0
;
k
<
input_width
;
++
k
)
{
weight_value
[
weight_width
*
index
+
k
]
+=
...
...
@@ -124,11 +136,35 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
}
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
)
{
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
input_width
=
input
.
dims
()[
1
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
weight_width
=
weight
->
value
().
dims
()[
1
];
auto
tmat_value
=
tmat
.
data
<
T
>
();
auto
weight_value
=
weight
->
mutable_value
()
->
data
<
T
>
();
auto
input_value
=
input
.
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
->
calc_index
(
j
);
for
(
size_t
k
=
0
;
k
<
input_width
;
++
k
)
{
int64_t
row_index
=
weight
->
GetIndexFromId
(
static_cast
<
int64_t
>
(
index
));
weight_value
[
row_index
*
weight_width
+
k
]
+=
tmat_value
[
i
*
tmat_width
+
j
]
*
input_value
[
input_width
*
i
+
k
];
}
}
}
}
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
MulGradError
(
const
framework
::
Tensor
&
tmat
,
const
framework
::
Tensor
&
weight
,
framework
::
Tensor
*
input
)
{
SimpleCodeTable
code_table
(
num_classes_
);
size_t
num_samples
=
tmat
.
dims
()[
0
];
size_t
tmat_width
=
tmat
.
dims
()[
1
];
size_t
input_width
=
input
->
dims
()[
1
];
...
...
@@ -138,10 +174,10 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
auto
input_value
=
input
->
data
<
T
>
();
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
size_t
index
=
code
.
calc_index
(
j
);
size_t
index
=
code
->
calc_index
(
j
);
for
(
size_t
k
=
0
;
k
<
input_width
;
++
k
)
{
input_value
[
input_width
*
i
+
k
]
+=
...
...
@@ -154,14 +190,13 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
template
<
typename
T
>
void
MatrixBitCodeFunctor
<
T
>::
Sub
(
framework
::
Tensor
*
tmat
)
{
SimpleCodeTable
code_table
(
num_classes_
);
size_t
num_samples
=
tmat
->
dims
()[
0
];
size_t
o_width
=
tmat
->
dims
()[
1
];
for
(
size_t
i
=
0
;
i
<
num_samples
;
++
i
)
{
auto
code
=
code_table
(
static_cast
<
size_t
>
(
ids_
[
i
])
);
int
code_length
=
code
.
get_length
();
auto
code
=
code_table
_
->
get_code
(
i
);
int
code_length
=
code
->
get_length
();
for
(
int
j
=
0
;
j
<
code_length
;
++
j
)
{
if
(
code
.
calc_bit
(
j
))
{
if
(
code
->
calc_bit
(
j
))
{
tmat
->
data
<
T
>
()[
i
*
o_width
+
j
]
-=
1
;
}
}
...
...
paddle/fluid/operators/math/matrix_bit_code.h
浏览文件 @
12e1719f
...
...
@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
...
...
@@ -92,9 +94,27 @@ inline int clz(const T& value) {
inline
size_t
FindLastSet
(
size_t
x
)
{
return
sizeof
(
size_t
)
*
8
-
clz
(
x
);
}
#endif // !_WIN32
// set a code interface to create multiple code
class
Code
{
public:
virtual
~
Code
()
{}
virtual
size_t
calc_index
(
int
bit
)
const
=
0
;
virtual
bool
calc_bit
(
int
bit
)
const
=
0
;
virtual
int
get_length
()
const
=
0
;
};
// set a CodeTable interface to create multiple code table
class
CodeTable
{
public:
virtual
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
=
0
;
virtual
size_t
size
()
const
=
0
;
virtual
int
get_max_code_length
()
const
=
0
;
virtual
~
CodeTable
()
{}
};
struct
SimpleCode
{
SimpleCode
(
size_t
code
,
size_t
num_classes
)
:
c_
(
code
+
num_classes
)
{}
class
SimpleCode
:
public
Code
{
public:
SimpleCode
(
size_t
code
,
size_t
num_classes
,
const
int64_t
*
ids
)
:
c_
(
static_cast
<
size_t
>
(
ids
[
code
])
+
num_classes
)
{}
/**
* Here the id of root shoud be 1 rather than 0, thus the encoding of class c
* is `c + num_classes` and all siblings can get the same weight indice using
...
...
@@ -104,41 +124,121 @@ struct SimpleCode {
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
inline
size_t
calc_index
(
int
bit
)
const
{
return
(
c_
>>
(
bit
+
1
))
-
1
;
}
inline
bool
calc_bit
(
int
bit
)
const
{
return
c_
&
(
1
<<
bit
);
}
in
line
in
t
get_length
()
const
{
return
FindLastSet
(
c_
)
-
1
;
}
size_t
calc_index
(
int
bit
)
const
{
return
(
c_
>>
(
bit
+
1
))
-
1
;
}
bool
calc_bit
(
int
bit
)
const
{
return
c_
&
(
1
<<
bit
);
}
int
get_length
()
const
{
return
FindLastSet
(
c_
)
-
1
;
}
private:
size_t
c_
;
};
struct
SimpleCodeTable
{
explicit
SimpleCodeTable
(
size_t
num_classes
)
:
num_classes_
(
num_classes
)
{}
SimpleCode
operator
()(
size_t
code
)
const
{
return
SimpleCode
(
code
,
num_classes_
);
template
<
typename
T
>
class
CustomCode
:
public
Code
{
public:
CustomCode
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
,
int
index
)
:
ids_
(
ids
),
index_
(
index
)
{
ptable_
=
ptable
.
Slice
(
index
,
index
+
1
);
pcode_
=
pcode
.
Slice
(
index
,
index
+
1
);
}
/**
* Here the id of root shoud be 1 rather than 0, thus the encoding of class c
* is `c + num_classes` and all siblings can get the same weight indice using
* prefixes.
* Weight index is the prefixes of encoding, thus leave out the right most
* bit in calc_index.
* Binary classification path is the suffixes of encoding, thus leave out the
* left most bit in calc_bit.
*/
size_t
calc_index
(
int
bit
)
const
{
return
ptable_
.
data
<
T
>
()[
bit
];
}
bool
calc_bit
(
int
bit
)
const
{
return
pcode_
.
data
<
T
>
()[
bit
];
}
int
get_length
()
const
{
int
length
=
0
;
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
ptable_
.
dims
()[
1
]);
i
++
)
{
if
(
ptable_
.
data
<
T
>
()[
i
]
>=
0
)
{
length
++
;
}
else
{
return
length
;
}
}
return
length
;
}
private:
framework
::
Tensor
ptable_
;
framework
::
Tensor
pcode_
;
const
int64_t
*
ids_
;
const
int
index_
;
};
class
SimpleCodeTable
:
public
CodeTable
{
public:
SimpleCodeTable
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
)
{}
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
{
std
::
unique_ptr
<
Code
>
coder
(
new
SimpleCode
(
code
,
num_classes_
,
ids_
));
return
coder
;
}
size_t
size
()
const
{
return
num_classes_
;
}
int
get_max_code_length
()
const
{
return
FindLastSet
(
num_classes_
-
1
);
}
private:
size_t
num_classes_
;
const
int64_t
*
ids_
;
};
template
<
typename
T
>
class
CustomCodeTable
:
public
CodeTable
{
public:
CustomCodeTable
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
ptable_
(
ptable
),
pcode_
(
pcode
),
ids_
(
ids
)
{}
std
::
unique_ptr
<
Code
>
get_code
(
int64_t
code
)
const
{
std
::
unique_ptr
<
Code
>
coder
(
new
CustomCode
<
T
>
(
ptable_
,
pcode_
,
ids_
,
code
));
return
coder
;
}
size_t
size
()
const
{
return
static_cast
<
size_t
>
(
ptable_
.
dims
()[
1
]);
}
int
get_max_code_length
()
const
{
return
static_cast
<
size_t
>
(
ptable_
.
dims
()[
1
]);
}
private:
const
framework
::
Tensor
&
ptable_
;
const
framework
::
Tensor
&
pcode_
;
const
int64_t
*
ids_
;
};
template
<
typename
T
>
class
MatrixBitCodeFunctor
{
public:
explicit
MatrixBitCodeFunctor
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
)
{}
MatrixBitCodeFunctor
(
size_t
num_classes
,
const
int64_t
*
ids
)
:
num_classes_
(
num_classes
),
ids_
(
ids
),
code_table_
(
new
SimpleCodeTable
(
num_classes
,
ids
))
{}
MatrixBitCodeFunctor
(
const
framework
::
Tensor
&
ptable
,
const
framework
::
Tensor
&
pcode
,
const
int64_t
*
ids
)
:
num_classes_
(
static_cast
<
size_t
>
(
ptable
.
dims
()[
1
])),
ids_
(
ids
),
code_table_
(
new
CustomCodeTable
<
int64_t
>
(
ptable
,
pcode
,
ids
))
{}
/* For j < code_length
tmat(i, j) += vec(0, index(i, j))
*/
void
Add
(
framework
::
Tensor
*
tmat
,
const
framework
::
Tensor
&
vec
);
void
Add
(
const
framework
::
Tensor
&
vec
,
framework
::
Tensor
*
tmat
);
/* For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
vec
);
/* For selected rows For j < code_length
vec(0, index(i, j)) += tmat(i, j)
*/
void
AddGrad
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
vec
);
/* For j < code_length
sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
*/
...
...
@@ -159,6 +259,12 @@ class MatrixBitCodeFunctor {
*/
void
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
Tensor
*
weight
,
const
framework
::
Tensor
&
input
);
/* For SelectedRows Weight, For index(i, j) >= 0:
weight.row(index(i, j)) += tmat(i, j) * input.row(i)
*/
void
MulGradWeight
(
const
framework
::
Tensor
&
tmat
,
framework
::
SelectedRows
*
weight
,
const
framework
::
Tensor
&
input
);
/* For j < code_length
input.row(i) += tmat(i, j) * weight.row(index(i, j))
*/
...
...
@@ -167,6 +273,7 @@ class MatrixBitCodeFunctor {
size_t
num_classes_
;
const
int64_t
*
ids_
;
std
::
unique_ptr
<
CodeTable
>
code_table_
;
};
}
// namespace math
}
// namespace operators
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
12e1719f
...
...
@@ -4587,27 +4587,43 @@ def hsigmoid(input,
num_classes
,
param_attr
=
None
,
bias_attr
=
None
,
name
=
None
):
name
=
None
,
path_table
=
None
,
path_code
=
None
,
is_custom
=
False
,
is_sparse
=
False
):
"""
The hierarchical sigmoid operator is used to accelerate the training
process of language model. This operator organizes the classes into a
complete binary tree, each leaf node represents a class(a word) and each
complete binary tree, or you can use is_custom to pass your own tree to
implement hierarchical. Each leaf node represents a class(a word) and each
internal node acts as a binary classifier. For each word there's a unique
path from root to it's leaf node, hsigmoid calculate the cost for each
internal node on the path, and sum them to get a total cost. hsigmoid can
achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
represents the size of word dict.
Refer to `Hierarchical Probabilistic Neural Network Language Model
Using default tree you can
Refer to `Hierarchical Probabilistic Neural Network Language Model
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first:
1. using your word dict to build a binary tree, each leaf node should be an word of your word dict
2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
means label of each binary classification, using 1 indicate true, 0 indicate false.
4. now, each word should has its path and code along the path, you can pass a batch of path and code
related to the same batch of inputs.
Args:
input (Variable): The input tensor variable with shape
:math:`[N
\\
times D]`, where :math:`N` is the size of mini-batch,
and :math:`D` is the feature size.
label (Variable): The tensor variable contains labels of training data.
It's a tensor with shape is :math:`[N
\\
times 1]`.
num_classes: (int), The number of classes, must not be less than 2.
num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
which indicates the num of classes using by binary classify.
param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
will create ParamAttr as param_attr. If the Initializer of the param_attr
...
...
@@ -4619,9 +4635,19 @@ def hsigmoid(input,
is not set, the bias is initialized zero. Default: None.
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None.
path_table: (Variable|None) this variable can store each batch of samples' path to root,
it should be in leaf -> root order
path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
structure and each element in this array is indexes in parent nodes' Weight Matrix.
path_code: (Variable|None) this variable can store each batch of samples' code,
each code consist with every code of parent nodes. it should be in leaf -> root order
is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
of W and input will be sparse.
Returns:
Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
Out: (
Lod
Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
Examples:
...
...
@@ -4637,27 +4663,62 @@ def hsigmoid(input,
out
=
helper
.
create_variable_for_type_inference
(
dtype
)
pre_out
=
helper
.
create_variable_for_type_inference
(
dtype
)
dim
=
input
.
shape
[
1
]
if
num_classes
<
2
:
raise
ValueError
(
"num_classes must not be less than 2."
)
weights
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
[
num_classes
-
1
,
dim
],
is_bias
=
False
,
dtype
=
input
.
dtype
)
inputs
=
{
"X"
:
input
,
"W"
:
weights
,
"Label"
:
label
}
if
helper
.
bias_attr
:
bias
=
helper
.
create_parameter
(
attr
=
helper
.
bias_attr
,
shape
=
[
1
,
num_classes
-
1
],
is_bias
=
True
,
if
((
num_classes
is
None
)
or
(
num_classes
<
2
))
and
(
not
is_custom
):
raise
ValueError
(
"num_classes must not be less than 2 with default tree"
)
if
(
is_custom
)
and
(
path_code
is
None
):
raise
ValueError
(
"path_code should not be None with costum tree"
)
elif
(
is_custom
)
and
(
path_table
is
None
):
raise
ValueError
(
"path_table should not be None with costum tree"
)
elif
(
is_custom
)
and
(
num_classes
is
None
):
raise
ValueError
(
"num_classes should not be None with costum tree"
)
else
:
pass
weights
=
None
if
not
is_custom
:
weights
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
[
num_classes
-
1
,
dim
],
is_bias
=
False
,
dtype
=
input
.
dtype
)
inputs
[
'Bias'
]
=
bias
else
:
weights
=
helper
.
create_parameter
(
attr
=
helper
.
param_attr
,
shape
=
[
num_classes
,
dim
],
is_bias
=
False
,
dtype
=
input
.
dtype
)
inputs
=
{
"X"
:
input
,
"W"
:
weights
,
"PTable"
:
path_table
,
"PathCode"
:
path_code
,
"Label"
:
label
}
if
helper
.
bias_attr
:
if
not
is_custom
:
bias
=
helper
.
create_parameter
(
attr
=
helper
.
bias_attr
,
shape
=
[
num_classes
-
1
,
1
],
is_bias
=
True
,
dtype
=
input
.
dtype
)
inputs
[
'Bias'
]
=
bias
else
:
bias
=
helper
.
create_parameter
(
attr
=
helper
.
bias_attr
,
shape
=
[
num_classes
,
1
],
is_bias
=
True
,
dtype
=
input
.
dtype
)
inputs
[
'Bias'
]
=
bias
helper
.
append_op
(
type
=
"hierarchical_sigmoid"
,
inputs
=
inputs
,
outputs
=
{
"Out"
:
out
,
"PreOut"
:
pre_out
},
attrs
=
{
"num_classes"
:
num_classes
})
attrs
=
{
"num_classes"
:
num_classes
,
"is_sparse"
:
is_sparse
})
return
out
...
...
python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
浏览文件 @
12e1719f
...
...
@@ -16,6 +16,8 @@ from __future__ import print_function
import
unittest
import
numpy
as
np
import
paddle.fluid.core
as
core
import
paddle.fluid
as
fluid
import
math
from
op_test
import
OpTest
...
...
@@ -40,6 +42,29 @@ class CodeTable(object):
return
self
.
c
&
(
1
<<
bit
)
class
CodeTableWithCustomTree
(
object
):
def
__init__
(
self
,
path_table
,
path_code
,
index
):
self
.
ptable_
=
path_table
self
.
pcode_
=
path_code
self
.
index_
=
index
def
cal_index
(
self
,
bit
):
return
self
.
ptable_
[
self
.
index_
][
bit
]
def
get_length
(
self
):
length
=
0
for
ele
in
self
.
ptable_
[
self
.
index_
]:
# find the first -1 to stop trace
if
ele
>=
0
:
length
=
length
+
1
else
:
return
length
return
length
def
cal_bit
(
self
,
bit
):
return
self
.
pcode_
[
self
.
index_
][
bit
]
def
hsigmoid
(
x
,
w
,
label
,
bias
,
num_classes
):
batch_size
=
x
.
shape
[
0
]
code_length
=
find_latest_set
(
num_classes
-
1
)
...
...
@@ -52,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
length
=
code_table
.
get_length
()
for
j
in
range
(
length
):
idx
=
code_table
.
cal_index
(
j
)
pre_output
[
i
][
j
]
+=
bias
[
0
][
idx
]
pre_output
[
i
][
j
]
+=
bias
[
idx
][
0
]
for
i
in
range
(
batch_size
):
code_table
=
CodeTable
(
num_classes
,
label
[
i
])
length
=
code_table
.
get_length
()
...
...
@@ -77,17 +102,58 @@ def hsigmoid(x, w, label, bias, num_classes):
return
pre_output
,
out
def
hsigmoidWithCustomTree
(
x
,
w
,
path_table
,
path_code
,
label
,
bias
,
num_classes
):
batch_size
=
x
.
shape
[
0
]
code_length
=
len
(
path_table
[
0
])
code_table
=
[
0
for
_
in
range
(
code_length
)]
# init pre_out with shape [N, code_length]
pre_output
=
np
.
zeros
((
batch_size
,
code_length
))
pre_sum
=
np
.
zeros
((
batch_size
,
1
))
out
=
np
.
zeros
((
batch_size
,
1
)).
astype
(
"float32"
)
if
isinstance
(
bias
,
np
.
ndarray
):
for
i
in
range
(
batch_size
):
code_table
=
CodeTableWithCustomTree
(
path_table
,
path_code
,
i
)
length
=
code_table
.
get_length
()
for
j
in
range
(
length
):
idx
=
code_table
.
cal_index
(
j
)
pre_output
[
i
][
j
]
+=
bias
[
idx
][
0
]
for
i
in
range
(
batch_size
):
code_table
=
CodeTableWithCustomTree
(
path_table
,
path_code
,
i
)
length
=
code_table
.
get_length
()
for
j
in
range
(
length
):
idx
=
code_table
.
cal_index
(
j
)
pre_output
[
i
][
j
]
+=
np
.
dot
(
w
[
idx
],
x
[
i
])
# clip[-40.0, 40.0]
pre_output
=
np
.
clip
(
pre_output
,
-
40.0
,
40.0
)
# out(i, 0) = \sum_j bit(i, j) * preout(i, j)
for
i
in
range
(
batch_size
):
code_table
=
CodeTableWithCustomTree
(
path_table
,
path_code
,
i
)
length
=
code_table
.
get_length
()
sum
=
0.0
for
j
in
range
(
length
):
if
code_table
.
cal_bit
(
j
):
sum
+=
pre_output
[
i
][
j
]
out
[
i
]
=
-
1.0
*
sum
# soft relu
pre_output
=
np
.
log
(
1
+
np
.
exp
(
pre_output
))
pre_sum
=
pre_output
.
sum
(
1
).
reshape
((
batch_size
,
1
))
out
+=
pre_sum
return
pre_output
,
out
class
TestHSigmoidOp
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"hierarchical_sigmoid"
num_classes
=
6
feature_size
=
8
batch_size
=
4
x
=
np
.
random
.
random
((
batch_size
,
feature_size
)).
astype
(
"float32"
)
w
=
np
.
random
.
random
((
num_classes
-
1
,
feature_size
)).
astype
(
"float32"
)
x
=
np
.
random
.
random
((
batch_size
,
feature_size
)).
astype
(
"float32"
)
*
2
w
=
np
.
random
.
random
(
(
num_classes
-
1
,
feature_size
)).
astype
(
"float32"
)
*
2
label
=
np
.
random
.
randint
(
0
,
num_classes
,
(
batch_size
,
1
))
bias
=
np
.
random
.
random
((
1
,
num_classes
-
1
)).
astype
(
"float32"
)
self
.
attrs
=
{
'num_classes'
:
num_classes
}
bias
=
np
.
random
.
random
((
num_classes
-
1
,
1
)).
astype
(
"float32"
)
self
.
attrs
=
{
'num_classes'
:
num_classes
,
'is_sparse'
:
False
}
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'Label'
:
label
,
'Bias'
:
bias
}
pre_output
,
out
=
hsigmoid
(
x
,
w
,
label
,
bias
,
num_classes
)
self
.
outputs
=
{
'PreOut'
:
pre_output
,
'Out'
:
out
}
...
...
@@ -99,5 +165,185 @@ class TestHSigmoidOp(OpTest):
self
.
check_grad
([
'Bias'
,
'X'
,
'W'
],
[
'Out'
],
no_grad_set
=
set
(
'Label'
))
class
TestHSigmoidOpSparse
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"hierarchical_sigmoid"
num_classes
=
6
#using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
feature_size
=
8
batch_size
=
4
x
=
np
.
random
.
random
((
batch_size
,
feature_size
)).
astype
(
"float32"
)
w
=
np
.
random
.
random
((
num_classes
-
1
,
feature_size
)).
astype
(
"float32"
)
label
=
np
.
array
([
0
,
1
,
4
,
5
])
path_table
=
np
.
array
(
[(
0
,
2
,
-
1
,
-
1
,
-
1
),
(
0
,
1
,
3
,
-
1
,
-
1
),
(
0
,
1
,
4
,
-
1
,
-
1
),
(
0
,
2
,
-
1
,
-
1
,
-
1
)])
#np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_code
=
np
.
array
([(
0
,
0
,
-
1
,
-
1
,
-
1
),
(
1
,
1
,
1
,
-
1
,
-
1
),
(
1
,
0
,
0
,
-
1
,
-
1
),
(
0
,
1
,
-
1
,
-
1
,
-
1
)])
#np.array to store
bias
=
np
.
random
.
random
((
num_classes
-
1
,
1
)).
astype
(
"float32"
)
self
.
attrs
=
{
'num_classes'
:
num_classes
,
'is_sparse'
:
True
}
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'PTable'
:
path_table
,
'PathCode'
:
path_code
,
'Label'
:
label
,
'Bias'
:
bias
}
pre_output
,
out
=
hsigmoidWithCustomTree
(
x
,
w
,
path_table
,
path_code
,
label
,
bias
,
num_classes
)
self
.
outputs
=
{
'PreOut'
:
pre_output
,
'Out'
:
out
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestHSigmoidOpWithSparseGrad
(
unittest
.
TestCase
):
def
hs_net_conf
(
self
,
is_sparse
):
input_word
=
fluid
.
layers
.
data
(
name
=
"x"
,
shape
=
[
1
],
dtype
=
'int64'
)
path_table
=
fluid
.
layers
.
data
(
name
=
'path_table'
,
shape
=
[
3
],
dtype
=
'int64'
)
path_code
=
fluid
.
layers
.
data
(
name
=
'path_code'
,
shape
=
[
3
],
dtype
=
'int64'
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
data_list
=
[
input_word
,
path_table
,
path_code
,
label
]
emb
=
fluid
.
layers
.
embedding
(
input
=
input_word
,
is_sparse
=
is_sparse
,
size
=
[
3
,
3
],
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Normal
(
scale
=
1
/
math
.
sqrt
(
3
))))
cost
=
fluid
.
layers
.
hsigmoid
(
input
=
emb
,
label
=
label
,
bias_attr
=
True
,
num_classes
=
3
,
path_table
=
path_table
,
path_code
=
path_code
,
is_custom
=
True
,
is_sparse
=
is_sparse
)
avg_cost
=
fluid
.
layers
.
reduce_mean
(
cost
)
return
avg_cost
,
data_list
def
training_test
(
self
,
is_sparse
):
with
fluid
.
program_guard
(
fluid
.
Program
(),
fluid
.
Program
()):
start_up
=
fluid
.
default_startup_program
()
start_up
.
random_seed
=
1
# Fix random seed
x
=
np
.
arange
(
6
).
reshape
(
6
)
path_table
=
np
.
array
([(
1
,
2
,
-
1
),
(
1
,
2
,
-
1
)])
path_code
=
np
.
array
([(
1
,
0
,
-
1
),
(
0
,
0
,
-
1
)])
label
=
np
.
array
([
1
,
4
])
loss
,
data_list
=
self
.
hs_net_conf
(
is_sparse
)
optimizer
=
fluid
.
optimizer
.
SGD
(
learning_rate
=
1e-3
)
optimizer
.
minimize
(
loss
)
main_program
=
fluid
.
default_main_program
()
place
=
fluid
.
CPUPlace
()
feeder
=
fluid
.
DataFeeder
(
feed_list
=
data_list
,
place
=
place
)
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
start_up
)
result
=
list
()
for
i
in
range
(
10
):
data
=
[([[
x
[
i
%
2
]]],
[
list
(
path_table
[
i
%
2
])],
[
list
(
path_code
[
i
%
2
])],
[
label
[
i
%
2
]])]
loss_val
=
exe
.
run
(
main_program
,
feed
=
feeder
.
feed
(
data
),
fetch_list
=
[
loss
])
result
.
append
(
loss_val
)
return
result
def
test_hs_grad_with_sparse
(
self
):
dense_result
=
self
.
training_test
(
is_sparse
=
False
)
sparse_result
=
self
.
training_test
(
is_sparse
=
True
)
assert
(
dense_result
==
sparse_result
)
class
TestHSigmoidOpWithCostumTree
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"hierarchical_sigmoid"
num_classes
=
6
#using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
feature_size
=
8
batch_size
=
4
x
=
np
.
random
.
random
((
batch_size
,
feature_size
)).
astype
(
"float32"
)
*
2
w
=
np
.
random
.
random
(
(
num_classes
-
1
,
feature_size
)).
astype
(
"float32"
)
*
2
label
=
np
.
array
([
0
,
1
,
4
,
5
])
path_table
=
np
.
array
(
[(
0
,
2
,
-
1
,
-
1
,
-
1
),
(
0
,
1
,
3
,
-
1
,
-
1
),
(
0
,
1
,
4
,
-
1
,
-
1
),
(
0
,
2
,
-
1
,
-
1
,
-
1
)])
#np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_code
=
np
.
array
([(
0
,
0
,
-
1
,
-
1
,
-
1
),
(
1
,
1
,
1
,
-
1
,
-
1
),
(
1
,
0
,
0
,
-
1
,
-
1
),
(
0
,
1
,
-
1
,
-
1
,
-
1
)])
#np.array to store
bias
=
np
.
random
.
random
((
num_classes
-
1
,
1
)).
astype
(
"float32"
)
self
.
attrs
=
{
'num_classes'
:
num_classes
,
'is_sparse'
:
False
}
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'PTable'
:
path_table
,
'PathCode'
:
path_code
,
'Label'
:
label
,
'Bias'
:
bias
}
pre_output
,
out
=
hsigmoidWithCustomTree
(
x
,
w
,
path_table
,
path_code
,
label
,
bias
,
num_classes
)
self
.
outputs
=
{
'PreOut'
:
pre_output
,
'Out'
:
out
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'Bias'
,
'X'
,
'W'
],
[
'Out'
],
no_grad_set
=
set
(
'Label'
))
class
TestHSigmoidOpWithCostumTreeWithoutBias
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"hierarchical_sigmoid"
num_classes
=
6
#using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
feature_size
=
8
batch_size
=
4
x
=
np
.
random
.
random
((
batch_size
,
feature_size
)).
astype
(
"float32"
)
*
2
w
=
np
.
random
.
random
(
(
num_classes
-
1
,
feature_size
)).
astype
(
"float32"
)
*
2
label
=
np
.
array
([
0
,
1
,
4
,
5
])
path_table
=
np
.
array
(
[(
0
,
2
,
-
1
,
-
1
,
-
1
),
(
0
,
1
,
3
,
-
1
,
-
1
),
(
0
,
1
,
4
,
-
1
,
-
1
),
(
0
,
2
,
-
1
,
-
1
,
-
1
)])
#np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
path_code
=
np
.
array
([(
0
,
0
,
-
1
,
-
1
,
-
1
),
(
1
,
1
,
1
,
-
1
,
-
1
),
(
1
,
0
,
0
,
-
1
,
-
1
),
(
0
,
1
,
-
1
,
-
1
,
-
1
)])
#np.array to store
# bias = np.random.random((num_classes - 1, 1)).astype("float32")
self
.
attrs
=
{
'num_classes'
:
num_classes
,
'is_sparse'
:
False
}
self
.
inputs
=
{
'X'
:
x
,
'W'
:
w
,
'PTable'
:
path_table
,
'PathCode'
:
path_code
,
'Label'
:
label
,
}
pre_output
,
out
=
hsigmoidWithCustomTree
(
x
=
x
,
w
=
w
,
path_table
=
path_table
,
path_code
=
path_code
,
label
=
label
,
bias
=
None
,
num_classes
=
num_classes
)
self
.
outputs
=
{
'PreOut'
:
pre_output
,
'Out'
:
out
}
def
test_check_output
(
self
):
self
.
check_output
()
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
,
'W'
],
[
'Out'
],
no_grad_set
=
set
(
'Label'
))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
12e1719f
...
...
@@ -185,6 +185,25 @@ class TestBook(unittest.TestCase):
input
=
x
,
label
=
y
,
num_classes
=
2
))
print
(
str
(
program
))
# test hsigmod with custom tree structure
program2
=
Program
()
with
program_guard
(
program2
):
x2
=
layers
.
data
(
name
=
'x2'
,
shape
=
[
4
,
8
],
dtype
=
'float32'
)
y2
=
layers
.
data
(
name
=
'y2'
,
shape
=
[
4
],
dtype
=
'int64'
)
path_table
=
layers
.
data
(
name
=
'path_table'
,
shape
=
[
4
,
6
],
dtype
=
'int64'
)
path_code
=
layers
.
data
(
name
=
'path_code'
,
shape
=
[
4
,
6
],
dtype
=
'int64'
)
self
.
assertIsNotNone
(
layers
.
hsigmoid
(
input
=
x2
,
label
=
y2
,
num_classes
=
6
,
path_table
=
path_table
,
path_code
=
path_code
,
is_custom
=
True
))
print
(
str
(
program2
))
def
test_sequence_expand
(
self
):
program
=
Program
()
with
program_guard
(
program
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录