Commit e7a4cfc0 authored by guosheng

complete the hsigmoid_op

Parent d6953816
@@ -86,25 +86,25 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(Tensor, required) The input Tensor, which the shape is"
-             "[N, D], which N is the size of mini-batch,"
-             "D is the embded size");
+             "(Tensor, required) The input tensor with shape [N, D], "
+             "where N is the size of mini-batch, and D is the feature size.");
     AddInput("W",
              "(Tensor, required), The parameters of hierarchical "
-             "sigmoid operator, each of them is s a 2-D tensor, the shape is"
-             "[num_classes - 1, D]");
+             "sigmoid operator, each of them is a 2-D tensor, the shape is "
+             "[num_classes - 1, D].");
     AddInput("Label",
              "(Tensor, required), The labels of training data. It's a"
-             "1-D tensor, which the shape is [N, 1]");
+             "tensor with shape [N, 1].");
     AddInput("Bias",
              "(Tensor, optional), The bias is a tensor with shape"
-             "[1, num_classes - 1]");
+             "[1, num_classes - 1].");
     AddOutput("Out",
               "(Tensor, required) The output of hierarchical sigmoid operator."
-              "the shape is [N, 1]");
+              "The shape is [N, 1].");
     AddOutput("PreOut",
-              "(Tensor, required) A intermedia 2-D Tensor, which the shape is "
-              "[batch_size, code_length]")
+              "(Tensor, required) An intermediate 2-D tensor with shape "
+              "[batch_size, code_length], where code_length represents the "
+              "maximum path length from root to leaf nodes.")
         .AsIntermediate();
     AddAttr<AttrType>("num_classes", "(int, required), The number of classes")
         .SetDefault(2);
...
@@ -44,9 +44,11 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
     framework::Tensor sum;
     math::SetConstant<DeviceContext, T> zero;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto pre_out_data = pre_out->mutable_data<T>(
+    auto* pre_out_data = pre_out->mutable_data<T>(
         framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
+    // Not all class (leaf) nodes' path lengths equal code_length; initializing
+    // with zeros keeps positions beyond a sample's path from adding loss.
     zero(dev_ctx, pre_out, static_cast<T>(0.0));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
     math::RowwiseSum<DeviceContext, T> row_sum;
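
A note on the zero initialization above: classes sit at different depths of the tree, so each row of PreOut is only filled up to that sample's path length and the rest stays zero. A minimal sketch of the per-class path lengths, assuming the SimpleCode id scheme used below (class c maps to node id c + num_classes; `path_length` is a hypothetical helper, not code from the repo):

    def path_length(c, num_classes):
        # number of internal nodes between the root and the leaf of class c
        return (c + num_classes).bit_length() - 1

    print([path_length(c, 6) for c in range(6)])     # [2, 2, 3, 3, 3, 3]
    print(max(path_length(c, 6) for c in range(6)))  # 3, i.e. code_length
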
@@ -61,16 +63,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
       bit_code.Add(pre_out, *bias);
     }
     bit_code.Mul(pre_out, *w, *in);
-    // clip the matrix with (-40, 40)
+    // clip to [-40, 40]
     Transform<DeviceContext> trans;
     trans(ctx.template device_context<DeviceContext>(), pre_out_data,
           pre_out_data + pre_out->numel(), pre_out_data,
           ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
     bit_code.Sum(*pre_out, out, static_cast<T>(-1));
-    // softrelu with threshold is 40.0
-    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
-          pre_out_data + pre_out->numel(), pre_out_data,
-          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
+    // use softrelu to calculate cross entropy
     pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
     row_sum(dev_ctx, *pre_out, &sum);
     out_mat.device(place) = sum_mat + out_mat;
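
Taken together, `bit_code.Sum(..., -1)`, the softrelu, and `row_sum` accumulate, for each node on a sample's path, log(1 + e^z) - bit * z, which is the cross entropy of sigmoid(z) against the node's 0/1 code bit. A quick numeric check of that identity (a sketch, not code from the op):

    import numpy as np

    def node_loss(z, bit):
        # per-node cost the kernel accumulates: softrelu(z) - bit * z
        return np.log1p(np.exp(z)) - bit * z

    z = 0.7
    p = 1.0 / (1.0 + np.exp(-z))  # sigmoid(z)
    assert np.isclose(node_loss(z, 1), -np.log(p))
    assert np.isclose(node_loss(z, 0), -np.log(1.0 - p))
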
@@ -102,14 +101,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
     auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
     math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
-    // softrelu derivative
-    Eigen::array<int, 2> bcast({1, static_cast<int>(pre_out_grad.dims()[1])});
+    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
     auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
     pre_out_grad_mat = out_grad_mat.broadcast(bcast);
     pre_out_grad_mat.device(place) =
         pre_out_grad_mat *
-        (static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp());
+        (static_cast<T>(1.0) -
+         static_cast<T>(1.0) / pre_out_mat.exp());  // softrelu derivative
     bit_code.Sub(&pre_out_grad);
+    // TODO(guosheng): multiply pre_out_grad by the subgradient of the clipping
+    // to be consistent with the clipping in the forward pass.
     if (bias_grad) {
       bias_grad->mutable_data<T>(ctx.GetPlace());
       bit_code.AddGrad(pre_out_grad, bias_grad);
...
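
The `1 - 1/exp(...)` factor above works because PreOut holds the post-softrelu values y = log(1 + e^z) saved by the forward pass, and 1 - e^{-y} equals sigmoid(z), the derivative of softrelu. A quick check of that identity (a numpy-only sketch):

    import numpy as np

    z = np.random.randn(4)
    y = np.log1p(np.exp(z))  # what the forward pass stores in PreOut
    # the kernel's (1 - 1/exp(y)) recovers d softrelu(z) / dz == sigmoid(z)
    assert np.allclose(1.0 - np.exp(-y), 1.0 / (1.0 + np.exp(-z)))
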
@@ -65,12 +65,24 @@ inline constexpr size_t FindLastSet(size_t x) {
 struct SimpleCode {
   SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
+  /**
+   * calc_index should make sure that all siblings have the same weight index.
+   * As for which weight index it maps to, it doesn't matter. To satisfy this,
+   * the id of the root should be 1, the left child of node i should be 2*i,
+   * and the right child of node i should be 2*i+1.
+   */
   inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; }
+  /**
+   * calc_bit uses the rightmost bits, while calc_index uses the leftmost
+   * bits. They are not the same, and that's why we say it doesn't matter
+   * which weight index calc_index maps to.
+   */
   inline bool calc_bit(int bit) const { return c_ & (1 << bit); }
   inline int get_length() const { return FindLastSet(c_) - 1; }

  private:
-  size_t c_;
+  size_t c_;  // Here the id of the root is 1 rather than 0, thus the id of
+              // class c is `c + num_classes`.
 };

 struct SimpleCodeTable {
@@ -83,7 +95,6 @@ struct SimpleCodeTable {
  private:
   size_t num_classes_;
-  int max_code_length_;
 };

 template <typename T>
...
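
To make the encoding concrete, here is a hypothetical Python mirror of SimpleCode (names are illustrative, not code from the repo):

    class SimpleCodePy(object):
        def __init__(self, code, num_classes):
            # root id is 1, so the leaf of class `code` gets id code + num_classes
            self.c = code + num_classes

        def calc_index(self, bit):
            # weight row for the internal node at depth `bit`; both siblings
            # share the same index, since the low bit is shifted away
            return (self.c >> (bit + 1)) - 1

        def calc_bit(self, bit):
            # which branch (0/1) the path takes at depth `bit`
            return bool(self.c & (1 << bit))

        def get_length(self):
            # number of internal nodes on the path, == FindLastSet(c) - 1
            return self.c.bit_length() - 1

    code = SimpleCodePy(3, num_classes=6)  # class 3 -> node id 9 == 0b1001
    assert code.get_length() == 3
    assert [code.calc_index(k) for k in range(3)] == [3, 1, 0]
    assert [code.calc_bit(k) for k in range(3)] == [True, False, False]
    # the sibling leaf (id 8) maps to the same weight row at depth 0:
    assert SimpleCodePy(2, num_classes=6).calc_index(0) == 3
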
@@ -3858,29 +3858,32 @@ def nce(input,
     return cost / (num_neg_samples + 1)


-def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
+def hsigmoid(input, label, num_classes, param_attr=None, bias_attr=None):
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language model. This operator organizes the classes into a
-    complete binary tree, each leaf node represents a class(a word) and each internal
-    node acts likea binary classifier. For each word there's a unique path from root
-    to it's leaf node, hsigmoid calculate the cost for each internal node on the path
-    (include root), and sum them to get a total cost. hsigmoid can achive a acceleration
-    from N to logN, for which N represents the size of word dict. This idea is from "F.
-    Morin, Y. Bengio(AISTATS 05): Hierarchical Probabilistic Neural Network Language Model.
+    complete binary tree, where each leaf node represents a class (a word) and
+    each internal node acts as a binary classifier. For each word there is a
+    unique path from the root to its leaf node; hsigmoid calculates the cost
+    for each internal node on the path and sums them to get the total cost.
+    hsigmoid can achieve an acceleration from :math:`O(N)` to :math:`O(logN)`,
+    where :math:`N` represents the size of the word dict.
+
+    Refer to `Hierarchical Probabilistic Neural Network Language Model
+    <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_

     Args:
-        input (Variable): (Tensor) The input Tensor, which the shape is
-            [N * D], which N is the size of mini-batch,D is the embded size
-        label (Variable): (Tensor), The labels of training data. It's a
-            1-D tensor, which the shape is [1, N]
-        num_classes: (int, default 2), The number of classes, must be lager or
-            equal than 2.
+        input (Variable): The input tensor variable with shape
+            :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
+            and :math:`D` is the feature size.
+        label (Variable): The tensor variable containing the labels of the
+            training data. It's a tensor with shape :math:`[N \\times 1]`.
+        num_classes (int): The number of classes, must not be less than 2.
         param_attr (ParamAttr|list of ParamAttr, default None): The parameter
             attribute for learnable parameters/weights of this layer.
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter
-            attribute for the bias of this layer. If it is set to None, no bias
-            will be added to the output units.
+            attribute for the bias of this layer. If it is set to False, no
+            bias will be applied.

     Returns:
         Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1]
@@ -3889,11 +3892,9 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):

         .. code-block:: python

-            x = fluid.layers.data(name='x', shape=[3, 2],
-                                  dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1, 3],
-                                  dtype='int64')
-            out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2)
+            x = fluid.layers.data(name='x', shape=[2], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[1], dtype='int64')
+            out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)
     """

     helper = LayerHelper('hierarchical_sigmoid', **locals())
@@ -3902,7 +3903,7 @@ def hsigmoid(input, label, num_classes=2, param_attr=None, bias_attr=None):
     pre_out = helper.create_tmp_variable(dtype)
     dim = input.shape[1]
     if num_classes < 2:
-        raise ValueError("num_classes must be lager or equal than 2.")
+        raise ValueError("num_classes must not be less than 2.")
     weights = helper.create_parameter(
         attr=helper.param_attr,
         shape=[num_classes - 1, dim],
...
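
For completeness, a runnable sketch of the updated docstring example, feeding random data through an executor; the feed shapes are assumptions consistent with the shape=[2] / shape=[1] declarations above, and the executor calls follow the Fluid conventions of this codebase:

    import numpy as np
    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')
    cost = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    out, = exe.run(feed={'x': np.random.rand(4, 2).astype('float32'),
                         'y': np.random.randint(0, 6, (4, 1)).astype('int64')},
                   fetch_list=[cost])
    print(out.shape)  # expected: (4, 1)
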
@@ -55,10 +55,7 @@ def hsigmoid(x, w, label, bias, num_classes):
             length = code_table.get_length()
             for k in range(length):
                 idx = code_table.cal_index(k)
-                sum = 0.0
-                for l in range(x.shape[1]):
-                    sum += w[idx][l] * x[j][l]
-                pre_output[j][k] += sum
+                pre_output[j][k] = np.dot(w[idx], x[j])
     # clip[-40.0, 40.0]
     pre_output = np.clip(pre_output, -40.0, 40.0)
     # out(i, 0) = \sum_j bit(i, j) * preout(i, j)
@@ -71,7 +68,6 @@ def hsigmoid(x, w, label, bias, num_classes):
             sum += pre_output[i][j]
         out[i] = -1.0 * sum
     # soft relu
-    np.clip(pre_output, -40.0, 40.0)
     pre_output = np.log(1 + np.exp(pre_output))
     pre_sum = pre_output.sum(1).reshape((batch_size, 1))
     out += pre_sum
@@ -81,11 +77,11 @@ def hsigmoid(x, w, label, bias, num_classes):
 class TestHSigmoidOp(OpTest):
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
-        num_classes = 4
-        embded_size = 1
-        batch_size = 1
-        x = np.random.random((batch_size, embded_size)).astype("float32")
-        w = np.random.random((num_classes - 1, embded_size)).astype("float32")
+        num_classes = 6
+        feature_size = 5
+        batch_size = 4
+        x = np.random.random((batch_size, feature_size)).astype("float32")
+        w = np.random.random((num_classes - 1, feature_size)).astype("float32")
         label = np.random.randint(0, num_classes, batch_size)
         bias = np.random.random((1, num_classes - 1)).astype("float32")
         self.attrs = {'num_classes': num_classes}
@@ -97,7 +93,7 @@ class TestHSigmoidOp(OpTest):
         self.check_output()

     def test_check_grad(self):
-        self.check_grad(['Bias', 'X', 'W'], 'Out', no_grad_set=set('Label'))
+        self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))


 if __name__ == '__main__':
...