diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index 6cb011611d93361cbaf9bc14c1c89aee7f417ab0..07ff8f947e59d2954783e2ba537bfce3cb320f22 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -47,11 +47,11 @@ template <typename DeviceContext, typename T>
 class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
+    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
+    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
     auto* path = ctx.Input<framework::LoDTensor>("PTable");
     auto* code = ctx.Input<framework::LoDTensor>("PathCode");
-    auto label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
+    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
     auto* bias = ctx.Input<framework::LoDTensor>("Bias");
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
@@ -114,8 +114,8 @@ template <typename DeviceContext, typename T>
 class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
+    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
+    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
     auto* path = ctx.Input<framework::LoDTensor>("PTable");
     auto* code = ctx.Input<framework::LoDTensor>("PathCode");
     auto* bias = ctx.Input<framework::LoDTensor>("Bias");
@@ -124,9 +124,9 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     bool is_sparse = ctx.Attr<bool>("is_sparse");
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
-    auto label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
-    auto pre_out = detail::Ref(ctx.Input<framework::LoDTensor>("PreOut"));
-    auto out_grad = detail::Ref(
+    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
+    auto& pre_out = detail::Ref(ctx.Input<framework::LoDTensor>("PreOut"));
+    auto& out_grad = detail::Ref(
         ctx.Input<framework::LoDTensor>(framework::GradVarName("Out")));
     framework::LoDTensor pre_out_grad;
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 44116a262cadd079accdd45c88e21e28bc2d8c21..b22e9715b8d1f49da1cfa07b488054e84e437863 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4589,23 +4589,33 @@ def hsigmoid(input,
              bias_attr=None,
              name=None,
              non_leaf_num=None,
-             ptable=None,
-             pcode=None,
-             is_costum=False,
+             path_table=None,
+             path_code=None,
+             is_custom=False,
              is_sparse=False):
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language models. This operator organizes the classes into a
-    complete binary tree, each leaf node represents a class(a word) and each
+    complete binary tree, or you can use is_custom to pass your own tree and
+    build the hierarchy yourself. Each leaf node represents a class (a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from the root to its leaf node; hsigmoid calculates the cost for each
     internal node on the path and sums them to get the total cost. hsigmoid can
     achieve an acceleration from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
     represents the size of the word dict.
 
-    Refer to `Hierarchical Probabilistic Neural Network Language Model
+    For the default tree, refer to `Hierarchical Probabilistic Neural Network Language Model
     <http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf>`_
+
+    If you want to use a custom tree, set 'is_custom' to True and do the following first:
+    1. Use your word dict to build a binary tree; each leaf node should be a word in your word dict.
+    2. Build a dict that maps word_id -> the word's leaf-to-root path; we call it path_table.
+    3. Build a dict that maps word_id -> the code along the word's leaf-to-root path; we call it path_code.
+       A code is the label of each binary classifier on the path, where 1 indicates true and 0 indicates false.
+    4. Now each word has its path and the code along the path, and you can pass a batch of paths and codes
+       that corresponds to the same batch of inputs.
+
 
     Args:
         input (Variable): The input tensor variable with shape
             :math:`[N \\times D]`, where :math:`N` is the size of mini-batch,
@@ -4613,13 +4623,6 @@ def hsigmoid(input,
         label (Variable): The tensor variable contains labels of training data.
             It's a tensor with shape :math:`[N \\times 1]`.
         num_classes: (int), The number of classes, must not be less than 2. With the default tree this has to be set.
-        non_leaf_num: this defines the number of non-leaf nodes in costumed tree
-        ptable: (Variable|None) this variable can store each batch of samples' path to root,
-            it should be in leaf -> root order
-            ptable should have the same shape with pcode, and for each sample i ptable[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        pcode: (Variable|None) this variable can store each batch of samples' code,
-            each code consist with every code of parent nodes. it should be in leaf -> root order
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
             of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
             will create ParamAttr as param_attr. If the Initializer of the param_attr
@@ -4631,8 +4634,15 @@ def hsigmoid(input,
            is not set, the bias is initialized zero. Default: None.
        name (str|None): A name for this layer (optional). If set None, the layer
            will be named automatically. Default: None.
-        is_costum: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
-            set you need to set ptable/pcode/non_leaf_num, otherwise num_classes should be set
+        non_leaf_num: this defines the number of non-leaf nodes in the custom tree.
+        path_table: (Variable|None) this variable stores each batch of samples' path to root,
+            in leaf -> root order.
+            path_table should have the same shape as path_code, and for each sample i, path_table[i] is an np.array-like
+            structure whose elements are indexes into the parent nodes' weight matrix.
+        path_code: (Variable|None) this variable stores each batch of samples' code, composed
+            of the codes of its parent nodes, in leaf -> root order.
+        is_custom: (bool|False) use a user-defined binary tree instead of the default complete binary tree; if is_custom
+            is set you need to set path_table/path_code/non_leaf_num, otherwise num_classes should be set.
        is_sparse: (bool|False) use sparse update instead of dense update; if set, the
            gradient of W and input will be sparse.
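
Note: the custom-tree inputs described in the new docstring can be built offline with plain numpy. Below is a minimal sketch (not part of the patch): the vocabulary, the `tree` dict, the `build_path_arrays` helper, and the `max_len` padding length are all hypothetical, chosen only to illustrate the layout. It produces `path_table` / `path_code` arrays padded with -1, in the same shape as the arrays used in the unit tests below.

```python
import numpy as np

# Hypothetical binary tree over a 4-word vocabulary. Non-leaf nodes are
# numbered 0..2 (so non_leaf_num=3); node 0 is the root, node 1 covers
# words 0-1 and node 2 covers words 2-3. For each word we list the
# (non_leaf_node_id, code) pairs along its path; the code is the 0/1 label
# the binary classifier at that node should predict for this word.
tree = {
    0: [(0, 0), (1, 0)],
    1: [(0, 0), (1, 1)],
    2: [(0, 1), (2, 0)],
    3: [(0, 1), (2, 1)],
}
max_len = 3  # fixed path length; shorter paths are padded with -1


def build_path_arrays(words):
    """Build per-sample path_table / path_code arrays for a batch of words."""
    table = np.full((len(words), max_len), -1, dtype="int64")
    code = np.full((len(words), max_len), -1, dtype="int64")
    for row, word in enumerate(words):
        for col, (node, bit) in enumerate(tree[word]):
            table[row][col] = node  # index into the non-leaf weight matrix W
            code[row][col] = bit    # binary label at that node
    return table, code


path_table, path_code = build_path_arrays([0, 3, 1])
# path_table -> [[ 0  1 -1], [ 0  2 -1], [ 0  1 -1]]
# path_code  -> [[ 0  0 -1], [ 1  1 -1], [ 0  1 -1]]
```

Arrays like these are then fed per batch through the `path_table=` / `path_code=` arguments (as `int64` data layers) together with `non_leaf_num=3` and `is_custom=True`, which is exactly what the unit tests below do.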
@@ -4653,22 +4663,22 @@ def hsigmoid(input,
     out = helper.create_variable_for_type_inference(dtype)
     pre_out = helper.create_variable_for_type_inference(dtype)
     dim = input.shape[1]
-    if ((num_classes is None) or (num_classes < 2)) and (not is_costum):
+    if ((num_classes is None) or (num_classes < 2)) and (not is_custom):
         raise ValueError(
             "num_classes must not be less than 2 with default tree")
 
-    if (is_costum) and (pcode is None):
-        raise ValueError("pcode should not be None with costum tree")
-    elif (is_costum) and (ptable is None):
-        raise ValueError("ptable should not be None with costum tree")
-    elif (is_costum) and (non_leaf_num is None):
+    if (is_custom) and (path_code is None):
+        raise ValueError("path_code should not be None with custom tree")
+    elif (is_custom) and (path_table is None):
+        raise ValueError("path_table should not be None with custom tree")
+    elif (is_custom) and (non_leaf_num is None):
         raise ValueError("non_leaf_num should not be None with custom tree")
     else:
         pass
 
     weights = None
 
-    if not is_costum:
+    if not is_custom:
         weights = helper.create_parameter(
             attr=helper.param_attr,
             shape=[num_classes - 1, dim],
@@ -4683,12 +4693,12 @@ def hsigmoid(input,
     inputs = {
         "X": input,
         "W": weights,
-        "PTable": ptable,
-        "PathCode": pcode,
+        "PTable": path_table,
+        "PathCode": path_code,
         "Label": label
     }
     if helper.bias_attr:
-        if not is_costum:
+        if not is_custom:
             bias = helper.create_parameter(
                 attr=helper.bias_attr,
                 shape=[num_classes - 1, 1],
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index 8152ce9b78cbb2468556115dfc7cfb936a0eeb1f..4254c3bb250487f03fdb162f279642901098dbb2 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -43,9 +43,9 @@ class CodeTable(object):
 
 
 class CodeTableWithCustomTree(object):
-    def __init__(self, ptable, pcode, index):
-        self.ptable_ = ptable
-        self.pcode_ = pcode
+    def __init__(self, path_table, path_code, index):
+        self.ptable_ = path_table
+        self.pcode_ = path_code
         self.index_ = index
 
     def cal_index(self, bit):
@@ -102,9 +102,10 @@ def hsigmoid(x, w, label, bias, num_classes):
     return pre_output, out
 
 
-def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes):
+def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
+                           num_classes):
     batch_size = x.shape[0]
-    code_length = len(ptable[0])
+    code_length = len(path_table[0])
     code_table = [0 for _ in range(code_length)]
     # init pre_out with shape [N, code_length]
     pre_output = np.zeros((batch_size, code_length))
@@ -112,13 +113,13 @@ def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
     out = np.zeros((batch_size, 1)).astype("float32")
     if isinstance(bias, np.ndarray):
         for i in range(batch_size):
-            code_table = CodeTableWithCustomTree(ptable, pcode, i)
+            code_table = CodeTableWithCustomTree(path_table, path_code, i)
             length = code_table.get_length()
             for j in range(length):
                 idx = code_table.cal_index(j)
                 pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
-        code_table = CodeTableWithCustomTree(ptable, pcode, i)
+        code_table = CodeTableWithCustomTree(path_table, path_code, i)
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
@@ -127,7 +128,7 @@ def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
     pre_output = np.clip(pre_output, -40.0, 40.0)
     # out(i, 0) = \sum_j bit(i, j) * preout(i, j)
     for i in range(batch_size):
-        code_table = CodeTableWithCustomTree(ptable, pcode, i)
+        code_table = CodeTableWithCustomTree(path_table, path_code, i)
         length = code_table.get_length()
         sum = 0.0
         for j in range(length):
@@ -173,24 +174,24 @@ class TestHSigmoidOpSparse(OpTest):
         x = np.random.random((batch_size, feature_size)).astype("float32")
         w = np.random.random((num_classes - 1, feature_size)).astype("float32")
         label = np.array([0, 1, 4, 5])
-        ptable = np.array(
+        path_table = np.array(
            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
             (0, 2, -1, -1, -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
+        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store
         bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
             'X': x,
             'W': w,
-            'PTable': ptable,
-            'PathCode': pcode,
+            'PTable': path_table,
+            'PathCode': path_code,
             'Label': label,
             'Bias': bias
         }
-        pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label,
-                                                 bias, num_classes)
+        pre_output, out = hsigmoidWithCustomTree(x, w, path_table, path_code,
+                                                 label, bias, num_classes)
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
@@ -200,11 +201,13 @@ class TestHSigmoidOpSparse(OpTest):
 class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
     def hs_net_conf(self, is_sparse):
         input_word = fluid.layers.data(name="x", shape=[1], dtype='int64')
-        ptable = fluid.layers.data(name='ptable', shape=[3], dtype='int64')
-        pcode = fluid.layers.data(name='pcode', shape=[3], dtype='int64')
+        path_table = fluid.layers.data(
+            name='path_table', shape=[3], dtype='int64')
+        path_code = fluid.layers.data(
+            name='path_code', shape=[3], dtype='int64')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-        data_list = [input_word, ptable, pcode, label]
+        data_list = [input_word, path_table, path_code, label]
 
         emb = fluid.layers.embedding(
             input=input_word,
@@ -218,9 +221,9 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
             label=label,
             bias_attr=True,
             non_leaf_num=3,
-            ptable=ptable,
-            pcode=pcode,
-            is_costum=True,
+            path_table=path_table,
+            path_code=path_code,
+            is_custom=True,
             is_sparse=is_sparse)
 
         avg_cost = fluid.layers.reduce_mean(cost)
@@ -232,8 +235,8 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
         start_up = fluid.default_startup_program()
         start_up.random_seed = 1  # Fix random seed
         x = np.arange(6).reshape(6)
-        ptable = np.array([(1, 2, -1), (1, 2, -1)])
-        pcode = np.array([(1, 0, -1), (0, 0, -1)])
+        path_table = np.array([(1, 2, -1), (1, 2, -1)])
+        path_code = np.array([(1, 0, -1), (0, 0, -1)])
         label = np.array([1, 4])
 
         loss, data_list = self.hs_net_conf(is_sparse)
@@ -248,8 +251,8 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
         exe.run(start_up)
         result = list()
         for i in range(10):
-            data = [([[x[i % 2]]], [list(ptable[i % 2])],
-                     [list(pcode[i % 2])], [label[i % 2]])]
+            data = [([[x[i % 2]]], [list(path_table[i % 2])],
+                     [list(path_code[i % 2])], [label[i % 2]])]
 
             loss_val = exe.run(main_program,
                                feed=feeder.feed(data),
@@ -273,24 +276,24 @@ class TestHSigmoidOpWithCostumTree(OpTest):
         w = np.random.random(
             (num_classes - 1, feature_size)).astype("float32") * 2
         label = np.array([0, 1, 4, 5])
-        ptable = np.array(
+        path_table = np.array(
            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
             (0, 2, -1, -1, -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
+        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store
         bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
             'X': x,
             'W': w,
-            'PTable': ptable,
-            'PathCode': pcode,
+            'PTable': path_table,
+            'PathCode': path_code,
             'Label': label,
             'Bias': bias
         }
-        pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label,
-                                                 bias, num_classes)
+        pre_output, out = hsigmoidWithCustomTree(x, w, path_table, path_code,
+                                                 label, bias, num_classes)
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
@@ -310,26 +313,26 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
         w = np.random.random(
             (num_classes - 1, feature_size)).astype("float32") * 2
         label = np.array([0, 1, 4, 5])
-        ptable = np.array(
+        path_table = np.array(
            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
             (0, 2, -1, -1, -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
+        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store
         # bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
             'X': x,
             'W': w,
-            'PTable': ptable,
-            'PathCode': pcode,
+            'PTable': path_table,
+            'PathCode': path_code,
             'Label': label,
         }
         pre_output, out = hsigmoidWithCustomTree(
             x=x,
             w=w,
-            ptable=ptable,
-            pcode=pcode,
+            path_table=path_table,
+            path_code=path_code,
             label=label,
             bias=None,
             num_classes=num_classes)
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 0dc3388b946cf133408be32f5b20a8c869a88785..b8477820eeb97a93ebf0aeb3a1c12895a66cb2c7 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -190,16 +190,18 @@ class TestBook(unittest.TestCase):
         with program_guard(program2):
             x2 = layers.data(name='x2', shape=[4, 8], dtype='float32')
             y2 = layers.data(name='y2', shape=[4], dtype='int64')
-            ptable = layers.data(name='ptable', shape=[4, 6], dtype='int64')
-            pcode = layers.data(name='pcode', shape=[4, 6], dtype='int64')
+            path_table = layers.data(
+                name='path_table', shape=[4, 6], dtype='int64')
+            path_code = layers.data(
+                name='path_code', shape=[4, 6], dtype='int64')
             self.assertIsNotNone(
                 layers.hsigmoid(
                     input=x2,
                     label=y2,
                     non_leaf_num=6,
-                    ptable=ptable,
-                    pcode=pcode,
-                    is_costum=True))
+                    path_table=path_table,
+                    path_code=path_code,
+                    is_custom=True))
             print(str(program2))
 
     def test_sequence_expand(self):
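
For reference, the per-sample cost that `hsigmoidWithCustomTree` computes in the tests can be written more directly. The sketch below is illustrative only: `sample_cost`, `path_row`, and `code_row` are hypothetical names, and the toy shapes are made up. It walks one sample's path and accumulates the binary logistic loss at each non-leaf node, clipping the logit to [-40, 40] just as the reference implementation clips `pre_output`.

```python
import numpy as np


def sample_cost(x_i, w, path_row, code_row, bias=None):
    """Cost of one sample on a custom tree: the sum of binary logistic
    losses at each non-leaf node on its path. -1 entries are padding."""
    cost = 0.0
    for node, bit in zip(path_row, code_row):
        if node == -1:  # padding marks the end of the real path
            break
        logit = np.dot(w[node], x_i)  # pre_output[i][j] in the reference
        if bias is not None:
            logit += bias[node][0]
        logit = np.clip(logit, -40.0, 40.0)
        # log(1 + e^logit) - bit * logit is the cross-entropy of
        # sigmoid(logit) against the 0/1 code bit
        cost += np.log(1.0 + np.exp(logit)) - bit * logit
    return cost


# Toy check with the first sample's path/code from the tests above
# (num_classes=6, so W has num_classes - 1 = 5 rows).
feature_size = 8
w = np.random.random((5, feature_size)).astype("float32")
x_i = np.random.random(feature_size).astype("float32")
print(sample_cost(x_i, w, (0, 2, -1, -1, -1), (0, 0, -1, -1, -1)))
```

This agrees with `out[i]` in `hsigmoidWithCustomTree`: the reference sums `np.log(1 + np.exp(pre_output))` over the path and subtracts the `pre_output` entries whose code bit is set; here both terms are folded into one loop.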