提交 e7a4cfc0 编写于 作者: G guosheng

complete the hsigmoid_op

上级 d6953816
......@@ -86,25 +86,25 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor, required) The input Tensor, which the shape is"
"[N, D], which N is the size of mini-batch,"
"D is the embded size");
"(Tensor, required) The input tensor with shape [N, D], "
"where N is the size of mini-batch, and D is the feature size.");
AddInput("W",
"(Tensor, required), The parameters of hierarchical "
"sigmoid operator, each of them is s a 2-D tensor, the shape is"
"[num_classes - 1, D]");
"sigmoid operator, each of them is a 2-D tensor, the shape is"
"[num_classes - 1, D].");
AddInput("Label",
"(Tensor, required), The labels of training data. It's a"
"1-D tensor, which the shape is [N, 1]");
"tensor with shape [N, 1].");
AddInput("Bias",
"(Tensor, optional), The bias is a tensor with shape"
"[1, num_classes - 1]");
"[1, num_classes - 1].");
AddOutput("Out",
"(Tensor, required) The output of hierarchical sigmoid operator."
"the shape is [N, 1]");
"The shape is [N, 1].");
AddOutput("PreOut",
"(Tensor, required) A intermedia 2-D Tensor, which the shape is "
"[batch_size, code_length]")
"(Tensor, required) A intermedia 2-D tensor with shape "
"[batch_size, code_length], where code_length represents the "
"maximum path length from root to leaf nodes.")
.AsIntermediate();
AddAttr<AttrType>("num_classes", "(int, required), The number of classes")
.SetDefault(2);
......
......@@ -44,9 +44,11 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
framework::Tensor sum;
math::SetConstant<DeviceContext, T> zero;
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto pre_out_data = pre_out->mutable_data<T>(
auto* pre_out_data = pre_out->mutable_data<T>(
framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
// Not all class(leaf) nodes' path lengths equal code_length, thus init as
// 0s can avoid out of path's loss.
zero(dev_ctx, pre_out, static_cast<T>(0.0));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
math::RowwiseSum<DeviceContext, T> row_sum;
......@@ -61,16 +63,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
bit_code.Add(pre_out, *bias);
}
bit_code.Mul(pre_out, *w, *in);
// clip the matrix with (-40, 40)
// clip to [-40, 40]
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out->numel(), pre_out_data,
ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
bit_code.Sum(*pre_out, out, static_cast<T>(-1));
// softrelu with threshold is 40.0
trans(ctx.template device_context<DeviceContext>(), pre_out_data,
pre_out_data + pre_out->numel(), pre_out_data,
ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
// use softrelu to calculate cross entropy
pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
row_sum(dev_ctx, *pre_out, &sum);
out_mat.device(place) = sum_mat + out_mat;
......@@ -102,14 +101,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
// softrelu derivative
Eigen::array<int, 2> bcast({1, static_cast<int>(pre_out_grad.dims()[1])});
Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
pre_out_grad_mat = out_grad_mat.broadcast(bcast);
pre_out_grad_mat.device(place) =
pre_out_grad_mat *
(static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp());
(static_cast<T>(1.0) -
static_cast<T>(1.0) / pre_out_mat.exp()); // softrelu derivative
bit_code.Sub(&pre_out_grad);
// TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
// be consistent with the clipping in forward.
if (bias_grad) {
bias_grad->mutable_data<T>(ctx.GetPlace());
bit_code.AddGrad(pre_out_grad, bias_grad);
......
......@@ -65,12 +65,24 @@ inline constexpr size_t FindLastSet(size_t x) {
struct SimpleCode {
SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
/**
* calc_index should make sure that all siblings have the same weight indice.
* As for which weight index it maps to, it doesn't matter. To satisfy this,
* the id of root should be 1, and the left child of a node i is 2*i, the
* right child of a node i is 2*i+1.
*/
inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
/**
* calc_bit uses the right most bits, while calc_index uses the left most
* bits. They are not the same, and that's why we say it doesn't matter which
* weight index calc_index maps to.
*/
inline bool calc_bit(int bit) const { return c_ & (1 << bit); }
inline int get_length() const { return FindLastSet(c_) - 1; }
private:
size_t c_;
size_t c_; // Here the id of root is 1 rather than 0, thus the id of class c
// is `c + num_classes`.
};
struct SimpleCodeTable {
......@@ -83,7 +95,6 @@ struct SimpleCodeTable {
private:
size_t num_classes_;
int max_code_length_;
};
template <typename T>
......
......@@ -3858,29 +3858,32 @@ def nce(input,
return cost / (num_neg_samples + 1)
def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
"""
The hierarchical sigmoid operator is used to accelerate the training
process of language model. This operator organizes the classes into a
complete binary tree, each leaf node represents a class(a word) and each internal
node acts likea binary classifier. For each word there's a unique path from root
to it's leaf node, hsigmoid calculate the cost for each internal node on the path
(include root), and sum them to get a total cost. hsigmoid can achive a acceleration
from N to logN, for which N represents the size of word dict. This idea is from "F.
Morin, Y. Bengio(AISTATS 05): Hierarchical Probabilistic Neural Network Language Model.
complete binary tree, each leaf node represents a class(a word) and each
internal node acts as a binary classifier. For each word there's a unique
path from root to it's leaf node, hsigmoid calculate the cost for each
internal node on the path, and sum them to get a total cost. hsigmoid can
achive a acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
represents the size of word dict.
Refer to `Hierarchical Probabilistic Neural Network Language Model
<http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
Args:
input (Variable): (Tensor) The input Tensor, which the shape is
[N * D], which N is the size of mini-batch,D is the embded size
label (Variable): (Tensor), The labels of training data. It's a
1-D tensor, which the shape is [1, N]
num_classes: (int, default 2), The number of classes, must be lager or
equal than 2.
input (Variable): The input tensor variable with shape
:math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
and :math:`D` is the feature size.
label (Variable): The tensor variable contains labels of training data.
It's a tensor with shape is :math:`[N \\times 1]`.
num_classes: (int), The number of classes, must not be less than 2.
param_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for learnable parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter
attribute for the bias of this layer. If it is set to None, no bias
will be added to the output units.
attribute for the bias of this layer. If it is set to False, no
bias will be applied.
Returns:
Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
......@@ -3889,11 +3892,9 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
.. code-block:: python
x = fluid.layers.data(name='x', shape=[3, 2],
dtype='float32')
y = fluid.layers.data(name='y', shape=[1, 3],
dtype='int64')
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2)
x = fluid.layers.data(name='x', shape=[2], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='int64')
out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
"""
helper = LayerHelper('hierarchical_sigmoid', **locals())
......@@ -3902,7 +3903,7 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
pre_out = helper.create_tmp_variable(dtype)
dim = input.shape[1]
if num_classes < 2:
raise ValueError("num_classes must be lager or equal than 2.")
raise ValueError("num_classes must not be less than 2.")
weights = helper.create_parameter(
attr=helper.param_attr,
shape=[num_classes - 1, dim],
......
......@@ -55,10 +55,7 @@ def hsigmoid(x, w, label, bias, num_classes):
length = code_table.get_length()
for k in range(length):
idx = code_table.cal_index(k)
sum = 0.0
for l in range(x.shape[1]):
sum += w[idx][l] * x[j][l]
pre_output[j][k] += sum
pre_output[j][k] = np.dot(w[idx], x[j])
# clip[-40.0, 40.0]
pre_output = np.clip(pre_output, -40.0, 40.0)
# out(i, 0) = \sum_j bit(i, j) * preout(i, j)
......@@ -71,7 +68,6 @@ def hsigmoid(x, w, label, bias, num_classes):
sum += pre_output[i][j]
out[i] = -1.0 * sum
# soft relu
np.clip(pre_output, -40.0, 40.0)
pre_output = np.log(1 + np.exp(pre_output))
pre_sum = pre_output.sum(1).reshape((batch_size, 1))
out += pre_sum
......@@ -81,11 +77,11 @@ def hsigmoid(x, w, label, bias, num_classes):
class TestHSigmoidOp(OpTest):
def setUp(self):
self.op_type = "hierarchical_sigmoid"
num_classes = 4
embded_size = 1
batch_size = 1
x = np.random.random((batch_size, embded_size)).astype("float32")
w = np.random.random((num_classes - 1, embded_size)).astype("float32")
num_classes = 6
feature_size = 5
batch_size = 4
x = np.random.random((batch_size, feature_size)).astype("float32")
w = np.random.random((num_classes - 1, feature_size)).astype("float32")
label = np.random.randint(0, num_classes, batch_size)
bias = np.random.random((1, num_classes - 1)).astype("float32")
self.attrs = {'num_classes': num_classes}
......@@ -97,7 +93,7 @@ class TestHSigmoidOp(OpTest):
self.check_output()
def test_check_grad(self):
self.check_grad(['Bias', 'X', 'W'], 'Out', no_grad_set=set('Label'))
self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册