From 02d68051db17e43f7b0c6785fa9f31384263a741 Mon Sep 17 00:00:00 2001
From: JiabinYang
Date: Mon, 26 Nov 2018 03:09:12 +0000
Subject: [PATCH] add sparsed bias grad, test=develop

---
 .../operators/hierarchical_sigmoid_op.cc      | 32 +++++++++++++------
 .../fluid/operators/hierarchical_sigmoid_op.h | 31 ++++++++++++++----
 .../fluid/operators/math/matrix_bit_code.cc   | 18 +++++++++++
 paddle/fluid/operators/math/matrix_bit_code.h |  5 +++
 python/paddle/fluid/layers/nn.py              |  4 +--
 .../fluid/tests/unittests/test_hsigmoid_op.py | 17 ++++------
 6 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index c350e6489dd..042d90e72f8 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -107,8 +107,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
              "it should have shape like [N, L], L is the length of the Path")
         .AsDispensable();
     AddInput("Bias",
-             "(LoDTensor, optional), The bias is a tensor with shape"
-             "[1, num_classes - 1].");
+             "(LoDTensor, optional), The bias is a tensor with shape "
+             "[num_classes - 1, 1] or "
+             "[non_leaf_num, 1].");
     AddOutput(
         "Out",
         "(LoDTensor, required) The output of hierarchical sigmoid operator."
@@ -148,11 +149,11 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
                    "Output(W@Grad should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@Grad should not be null.");
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
     if (!ctx->Attrs().Get<bool>("is_sparse")) {
+      if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+        ctx->SetOutputDim(framework::GradVarName("Bias"),
+                          ctx->GetInputDim("Bias"));
+      }
       ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
     }
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
@@ -172,20 +173,31 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_W_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_Bias_var_name =
+        op_desc.Output(framework::GradVarName("Bias")).front();
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
+      block->Var(out_W_var_name)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to SelectedRows";
+      block->Var(out_Bias_var_name)
+          ->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_W_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to LoDTensor";
+      block->Var(out_Bias_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    block->Var(out_W_var_name)->SetDataType(block->Var("W")->GetDataType());
   }
 };

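Background for the var-type inference above: when is_sparse is enabled, both W@GRAD and Bias@GRAD are emitted as SelectedRows, which hold values only for the rows that the sampled paths actually visit. A minimal NumPy sketch of that idea (toy path table and variable names invented for illustration; this is not the operator's C++ code):

import numpy as np

# Toy setup: 4 internal (non-leaf) nodes, batch of 3 samples.
# ptable[i] lists the internal nodes visited by sample i (-1 = padding),
# mirroring the [N, L] PTable layout used by the custom-tree mode.
ptable = np.array([[0, 2, -1],
                   [0, 3, -1],
                   [1, -1, -1]])
pre_out_grad = np.random.rand(3, 3).astype("float32")  # d(loss)/d(pre_out)

num_nodes = 4
# Dense Bias@GRAD: shape [num_nodes, 1], mostly zeros when the batch only
# touches a few internal nodes.
dense_bias_grad = np.zeros((num_nodes, 1), dtype="float32")
for i in range(ptable.shape[0]):
    for j in range(ptable.shape[1]):
        idx = ptable[i][j]
        if idx < 0:
            break
        dense_bias_grad[idx][0] += pre_out_grad[i][j]

# SelectedRows-style Bias@GRAD: only the visited rows plus their values.
rows = sorted({int(idx) for row in ptable for idx in row if idx >= 0})
values = dense_bias_grad[rows]  # shape [len(rows), 1]
print(rows, values.ravel())

Rows that never appear in the path table get no entry at all, which is why the sparse form stores len(rows) values instead of num_classes - 1.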
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index b4a5fe83091..44853dafe9f 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -124,13 +124,12 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto* w = ctx.Input<framework::LoDTensor>("W");
     auto* path = ctx.Input<framework::LoDTensor>("PTable");
     auto* code = ctx.Input<framework::LoDTensor>("PCode");
+    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
     auto* in_grad =
         ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
     bool is_sparse = ctx.Attr<bool>("is_sparse");
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
-    auto* bias_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
     auto* label = ctx.Input<framework::LoDTensor>("Label");
     auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
     auto* out_grad =
@@ -174,12 +173,15 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
         pre_out_grad_mat * out_grad_mat.broadcast(bcast);
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
-    if (!is_sparse) {
+    if (!is_sparse) {
+      auto* bias_grad =
+          ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->mutable_data<T>(ctx.GetPlace());
+        zero(dev_ctx, bias_grad, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
+
       auto* w_grad =
           ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
       w_grad->mutable_data<T>(ctx.GetPlace());
@@ -199,6 +201,21 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
       w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
       zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
+      auto* bias_grad =
+          ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->set_rows(real_rows);
+        // build ids -> rows index map
+        bias_grad->SyncIndex();
+        bias_grad->set_height(bias->dims()[0]);
+        auto* bias_grad_value = bias_grad->mutable_value();
+        std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
+                                     bias->dims()[1]};
+        bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
+                                         ctx.GetPlace());
+        zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
     }
     bit_code->MulGradError(pre_out_grad, *w, in_grad);
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index 9a0cf8701fb..0c1aa29a18d 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -48,6 +48,24 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
   }
 }
 
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
+                                      framework::SelectedRows* vec) {
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code->calc_index(j);
+      int64_t row_index =
+          vec->AutoGrownIndex(static_cast<int64_t>(index), false, true);
+      vec->mutable_value()->data<T>()[row_index] +=
+          tmat.data<T>()[i * width + j];
+    }
+  }
+}
+
 template <typename T>
 void MatrixBitCodeFunctor<T>::Sum(const framework::LoDTensor& tmat,
                                   framework::LoDTensor* sum, T scale_sum) {
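The AddGrad overload added above scatters tmat(i, j) into the SelectedRows value tensor, translating each visited node id into a compact row offset via the row list prepared in the kernel (set_rows + SyncIndex, queried here through AutoGrownIndex). A rough Python equivalent of that bookkeeping, with an invented id-to-row map standing in for the SelectedRows index (illustrative only, not Paddle API):

import numpy as np

# real_rows: node ids that actually receive gradient; id_to_row plays the
# role of SyncIndex/AutoGrownIndex by mapping an id to a compact row offset.
real_rows = [0, 1, 2, 3]                     # hypothetical visited node ids
id_to_row = {node_id: r for r, node_id in enumerate(real_rows)}

codes = [[0, 2], [0, 3], [1]]                # per-sample internal-node paths
tmat = np.random.rand(len(codes), 3).astype("float32")    # pre_out_grad

value = np.zeros((len(real_rows), 1), dtype="float32")    # bias_grad value
for i, path in enumerate(codes):
    for j, node_id in enumerate(path):
        # value[row] += tmat(i, j), like vec->mutable_value()->data()[row_index]
        value[id_to_row[node_id]][0] += tmat[i][j]
print(value.ravel())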
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index c8d21ba686b..673fcb65c81 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -241,6 +241,11 @@ class MatrixBitCodeFunctor {
   */
   void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec);
 
+  /* For SelectedRows. For j < code_length
+       vec(0, index(i, j)) += tmat(i, j)
+  */
+  void AddGrad(const framework::LoDTensor& tmat, framework::SelectedRows* vec);
+
   /* For j < code_length
        sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
   */
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index b02d75e55be..8170ccf0827 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4639,14 +4639,14 @@ def hsigmoid(input,
     if not is_costum:
         bias = helper.create_parameter(
             attr=helper.bias_attr,
-            shape=[1, num_classes - 1],
+            shape=[num_classes - 1, 1],
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = bias
     else:
         bias = helper.create_parameter(
             attr=helper.bias_attr,
-            shape=[1, non_leaf_num],
+            shape=[non_leaf_num, 1],
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = bias
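The Python layer above now creates the bias parameter with shape [num_classes - 1, 1] (or [non_leaf_num, 1] for a custom tree), and the test helpers below index it as bias[idx][0] instead of bias[0][idx]. A quick NumPy check with toy sizes that the transposed layout is numerically equivalent (illustrative only):

import numpy as np

num_nodes = 6
old_bias = np.random.random((1, num_nodes)).astype("float32")   # old layout
new_bias = old_bias.T                                            # [num_nodes, 1]

path = [0, 2, 5]   # internal nodes on one sample's path
old_contrib = [old_bias[0][idx] for idx in path]
new_contrib = [new_bias[idx][0] for idx in path]
assert np.allclose(old_contrib, new_contrib)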
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index 2f4225f912d..a3024dded6e 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -77,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTable(num_classes, label[i])
         length = code_table.get_length()
@@ -115,7 +115,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTableWithCustomTree(ptable, pcode, i)
         length = code_table.get_length()
@@ -150,7 +150,7 @@ class TestHSigmoidOp(OpTest):
         w = np.random.random(
             (num_classes - 1, feature_size)).astype("float32") * 2
         label = np.random.randint(0, num_classes, (batch_size, 1))
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
         pre_output, out = hsigmoid(x, w, label, bias, num_classes)
@@ -178,7 +178,7 @@ class TestHSigmoidOpSparse(OpTest):
             -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
             'X': x,
@@ -193,7 +193,6 @@ class TestHSigmoidOpSparse(OpTest):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()
 
 
@@ -208,7 +207,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
 
         emb = fluid.layers.embedding(
             input=input_word,
-            is_sparse=False,
+            is_sparse=is_sparse,
             size=[3, 3],
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                 scale=1 / math.sqrt(3))))
@@ -220,6 +219,7 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
             ptable=ptable,
             pcode=pcode,
             is_costum=True,
+            bias_attr=True,
             is_sparse=is_sparse)
 
         avg_cost = fluid.layers.reduce_mean(cost)
@@ -240,7 +240,6 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
             optimizer.minimize(loss)
 
             main_program = fluid.default_main_program()
-            # print("main program: {program}".format{program=str(main_program)})
             place = fluid.CPUPlace()
             feeder = fluid.DataFeeder(feed_list=data_list, place=place)
             exe = fluid.Executor(place)
@@ -279,7 +278,7 @@ class TestHSigmoidOpWithCostumTree(OpTest):
             -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
             'X': x,
@@ -294,11 +293,9 @@ class TestHSigmoidOpWithCostumTree(OpTest):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()
 
     def test_check_grad(self):
-        print("checking outputGrad in CostumTree")
         self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
 
 
--
GitLab
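The custom-tree test above keeps 'Bias' in the check_grad list, so the reshaped bias path stays covered by the numeric gradient check. Conceptually that check is a finite-difference comparison like the toy sketch below (the loss, path, and tolerance are invented for illustration; OpTest's actual procedure differs):

import numpy as np

def loss(bias, x, path):
    # toy scalar loss over a fixed path of internal nodes
    pre = np.array([x[j] + bias[idx][0] for j, idx in enumerate(path)])
    return np.sum(np.log(1.0 + np.exp(pre)))

path = [0, 2]
bias = np.random.random((4, 1))
x = np.random.random(len(path))

# analytic gradient: d(loss)/d(bias[idx]) = sigmoid(pre_j) for nodes on the path
analytic = np.zeros_like(bias)
for j, idx in enumerate(path):
    analytic[idx][0] += 1.0 / (1.0 + np.exp(-(x[j] + bias[idx][0])))

eps = 1e-5
for r in range(bias.shape[0]):
    b_hi, b_lo = bias.copy(), bias.copy()
    b_hi[r][0] += eps
    b_lo[r][0] -= eps
    numeric = (loss(b_hi, x, path) - loss(b_lo, x, path)) / (2.0 * eps)
    assert abs(numeric - analytic[r][0]) < 1e-4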