From 4a7e6deb631690017509d0719983279e2621c28b Mon Sep 17 00:00:00 2001
From: JesseyXujin
Date: Mon, 2 Sep 2019 17:22:50 +0800
Subject: [PATCH] add padding in linear_chain_crf op (#19583)

* add padding in linear_chain_crf op
* modify API.spec
* add linear_chain_crf_op.cc and linear_chain_crf_op.h
* remove useless unit test, test=develop
* modify API.spec, test=develop
* remove some blanks in nn.py, test=develop
* fix some bugs in nn.py and API.spec, test=develop
* fix nn.py, test=develop
* fix API.spec, test=develop
* fix bug of CI test in test_linear_chain_crf_op.py
* fix bug of CI test in test_linear_chain_crf_op.py, test=develop
* remove paddle_enforce, test=develop
* remove paddle_enforce, test=develop
* remove paddle_enforce, test=develop
* remove paddle_enforce, test=develop
* remove paddle_enforce, test=develop
* remove paddle_enforce, test=develop
* modify nn.py, test=develop
* fix API.spec, test=develop
* fix unittest bug, test=develop
---
 paddle/fluid/API.spec                         |   2 +-
 paddle/fluid/operators/linear_chain_crf_op.cc | 125 ++++++++----
 paddle/fluid/operators/linear_chain_crf_op.h  | 190 ++++++++++++------
 python/paddle/fluid/layers/nn.py              |  76 +++++--
 .../unittests/test_linear_chain_crf_op.py     |  78 ++++++-
 5 files changed, 356 insertions(+), 115 deletions(-)
 mode change 100644 => 100755 paddle/fluid/API.spec
 mode change 100644 => 100755 paddle/fluid/operators/linear_chain_crf_op.h
 mode change 100644 => 100755 python/paddle/fluid/layers/nn.py
 mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
old mode 100644
new mode 100755
index 287b918255..955a37c5ee
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -115,7 +115,7 @@ paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', '
 paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a'))
 paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
 paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
-paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '34f96be41684b0959897a9e735997e20'))
+paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '715f8f12d68ae90504a7b768e82be6f4'))
 paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '5ce117258e243be1c81539e254178d90'))
 paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '8e6ce424cf9e261ef32ee229c06a6e66'))
 paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'f43c659ca1749a3f0ff2231e6dfda07d'))
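The API.spec change above records the user-visible part of this patch: `linear_chain_crf` gains an optional `length` argument. With `length` set, the operator accepts padded tensors ([N x S x D] emissions, [N x S] labels) instead of the original LoD layout in which all sequences are concatenated along dimension 0. A minimal NumPy sketch of how the two layouts relate (the `to_padded` helper is hypothetical, mirroring the `seq_pad` utility in the unit test at the end of this patch):

.. code-block:: python

    import numpy as np

    def to_padded(flat, lengths):
        # Concatenated sequences [sum(lengths), D] plus per-sequence lengths
        # -> padded batch [N, max(lengths), D]; pad slots are left as zeros.
        max_len = max(lengths)
        out = np.zeros((len(lengths), max_len) + flat.shape[1:], flat.dtype)
        offset = 0
        for i, l in enumerate(lengths):
            out[i, :l] = flat[offset:offset + l]
            offset += l
        return out

    lengths = [3, 3, 4, 2]                   # one entry per sequence
    flat = np.random.rand(sum(lengths), 10)  # LoD-style layout, shape (12, 10)
    padded = to_padded(flat, lengths)        # padded layout, shape (4, 4, 10)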
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index a94704a728..ed09c64ffd 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -23,21 +23,28 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Emission",
-             "(LoDTensor, default LoDTensor<float>) "
-             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
+             "(LoDTensor/Tensor). When given as a LoDTensor, a 2-D LoDTensor"
+             " with shape [N x D], where N is the size of the "
              "mini-batch and D is the total tag number. The unscaled emission "
-             "weight matrix for the linear chain CRF. ");
+             "weight matrix for the linear chain CRF. When given as a Tensor, "
+             "a 3-D Tensor with shape [N x S x D], where N is the batch size, "
+             "S is the max sequence length, and D is the total tag number.");
     AddInput("Transition",
              "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
              "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
              "operator. See more details in the operator's comments.");
     AddInput("Label",
-             "(LoDTensor, default LoDTensor<int64_t>) A LoDTensor with shape "
+             "(LoDTensor/Tensor). When given as a LoDTensor, its shape is "
              "[N x 1], where N is the total element number in a mini-batch. "
-             "The ground truth.");
+             "When given as a Tensor, its shape is [N x S], where N is the "
+             "batch size and S is the max sequence length. The ground truth.");
+    AddInput("length",
+             "(Tensor, default Tensor<int64_t>) A Tensor with shape "
+             "[M x 1], where M is the sequence number in a mini-batch.")
+        .AsDispensable();
     AddOutput(
         "Alpha",
-        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>), the same shape as Emission. "
         "The forward vectors for the entire batch. Denote it as $\alpha$. "
         "$\alpha$ is a memo table used to calculate the normalization "
         "factor in CRF. $\alpha[k, v]$ stores the unnormalized "
@@ -49,7 +56,7 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
     AddOutput(
         "EmissionExps",
-        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
+        "(Tensor, default Tensor<float>), the same shape as Emission. "
         "The exponentials of Input(Emission). This is an intermediate "
         "computational result in forward computation, and will be reused in "
         "backward computation.")
         .AsIntermediate();
@@ -145,11 +152,6 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"),
                    "Output(LogLikelihood) should be not null.");
-
-    auto emission_dims = ctx->GetInputDim("Emission");
-    PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
-                      "The Input(Emission) should be a 2-D tensor.");
-    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
     auto transition_dims = ctx->GetInputDim("Transition");
     PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
@@ -164,20 +166,40 @@
           "An invalid dimension for the Input(Transition), which should "
           "be a 2-D tensor with shape [(D + 2) x D].");
     }
-    PADDLE_INFERSHAPE_ENFORCE_EQ(
-        ctx, emission_dims[1], transition_dims[1],
-        "The 2nd dimension of the Input(Emission) and the Input(Transition) "
-        "should be equal to the tag number.");
-
-    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
-                   "The Input(Label) should be a 2-D tensor with the 2nd "
-                   "dimensions fixed to 1.");
-    PADDLE_INFERSHAPE_ENFORCE_EQ(
-        ctx, emission_dims[0], label_dims[0],
-        "The height of Input(Emission) and the height of Input(Label) "
-        "should be the same.");
-
+    auto emission_dims = ctx->GetInputDim("Emission");
+    PADDLE_ENFORCE_NE(emission_dims[0], 0,
+                      "An empty mini-batch is not allowed.");
+    if (ctx->HasInput("length")) {
+      PADDLE_ENFORCE_EQ(emission_dims.size(), 3,
+                        "The Input(Emission) should be a 3-D tensor.");
+      auto label_dims = ctx->GetInputDim("Label");
+      PADDLE_ENFORCE_EQ(label_dims.size(), 3,
+                        "The Input(Label) should be a 3-D tensor.");
+      PADDLE_INFERSHAPE_ENFORCE_EQ(
+          ctx, emission_dims[0], label_dims[0],
+          "The batch size of Input(Emission) and Input(Label) "
+          "should be the same.");
+      PADDLE_INFERSHAPE_ENFORCE_EQ(
+          ctx, emission_dims[1], label_dims[1],
+          "The max sequence length of Input(Emission) and Input(Label) "
+          "should be the same.");
+    } else {
+      PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
+                        "The Input(Emission) should be a 2-D tensor.");
+      PADDLE_INFERSHAPE_ENFORCE_EQ(
+          ctx, emission_dims[1], transition_dims[1],
+          "The 2nd dimension of the Input(Emission) and the Input(Transition) "
+          "should be equal to the tag number.");
+      auto label_dims = ctx->GetInputDim("Label");
+      PADDLE_ENFORCE_EQ(label_dims.size(), 2,
+                        "The Input(Label) should be a 2-D tensor with the 2nd "
+                        "dimension fixed to 1.");
+      PADDLE_INFERSHAPE_ENFORCE_EQ(
+          ctx, emission_dims[0], label_dims[0],
+          "The height of Input(Emission) and the height of Input(Label) "
+          "should be the same.");
+    }
     ctx->SetOutputDim("Alpha", emission_dims);
     ctx->SetOutputDim("EmissionExps", emission_dims);
     ctx->SetOutputDim("TransitionExps", transition_dims);
@@ -210,12 +232,6 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")),
                    "Input(LogLikelihood@GRAD) should be not null.");
-
-    auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
-    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
-                      "The Input(EmissionExps) should be a 2-D tensor.");
-    PADDLE_ENFORCE(emission_exps_dims[0],
-                   "An empty mini-batch is not allowed.");
     auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
     PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
                       "The Input(TransitionExps) should be a 2-D tensor.");
tensor."); @@ -230,15 +246,34 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { "An invalid dimension for the Input(TransitionExps), which should " "be a 2-D tensor with shape [(D + 2) x D]."); } - PADDLE_INFERSHAPE_ENFORCE_EQ( - ctx, emission_exps_dims[1], transition_exps_dims[1], - "The 2nd dimension of the Input(EmissionExps) and the " - "Input(TransitionExps) should be equal to the tag number."); + auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, - "The Input(Label) should be a 2-D tensor with the 2nd " - "dimensions fixed to 1."); + if (ctx->HasInput("length")) { + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 3, + "The Input(EmissionExps) should be a 3-D tensor."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_exps_dims[2], transition_exps_dims[1], + "The 3nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + PADDLE_ENFORCE_EQ(label_dims.size(), 3, + "The Input(Label) should be a 3-D tensor with the 3nd " + "dimensions fixed to 1."); + } else { + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2, + "The Input(EmissionExps) should be a 2-D tensor."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_exps_dims[1], transition_exps_dims[1], + "The 2nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, + "The Input(Label) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "The Input(Label) 2nd dimensions fixed to 1."); + } + PADDLE_ENFORCE_NE(emission_exps_dims[0], 0, + "An empty mini-batch is not allowed."); + PADDLE_INFERSHAPE_ENFORCE_EQ( ctx, emission_exps_dims[0], label_dims[0], "The height of Input(EmissionExps) and the height of Input(Label) " @@ -246,8 +281,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput(framework::GradVarName("Emission"))) { ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); - ctx->ShareLoD("Emission", framework::GradVarName("Emission")); + if (ctx->HasInput("length") == false) { + ctx->ShareLoD("Emission", framework::GradVarName("Emission")); + } } + // ctx->SetOutputDim(framework::GradVarName("Emission"), + // emission_exps_dims); if (ctx->HasOutput(framework::GradVarName("Transition"))) { ctx->SetOutputDim(framework::GradVarName("Transition"), transition_exps_dims); @@ -275,15 +314,15 @@ class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr op(new framework::OpDesc()); op->SetType("linear_chain_crf_grad"); op->SetAttrMap(Attrs()); - op->SetInput("Emission", Input("Emission")); op->SetInput("Transition", Input("Transition")); op->SetInput("Label", Input("Label")); - op->SetInput("Alpha", Output("Alpha")); op->SetInput("EmissionExps", Output("EmissionExps")); op->SetInput("TransitionExps", Output("TransitionExps")); - + if (ForwardOp().Inputs().count("length") > 0) { + op->SetInput("length", Input("length")); + } op->SetInput(framework::GradVarName("LogLikelihood"), OutputGrad("LogLikelihood")); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h old mode 100644 new mode 100755 index d5162bcd74..dab9aab707 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -54,20 +54,9 @@ template class LinearChainCRFOpKernel : public framework::OpKernel { 
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
old mode 100644
new mode 100755
index d5162bcd74..dab9aab707
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -54,20 +54,9 @@ template <typename DeviceContext, typename T>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    // TODO(caoying) The checks related to LoD information should be
-    // moved into InferShape once after the InferShape is refactored.
-    PADDLE_ENFORCE_EQ(ctx.Input<LoDTensor>("Emission")->NumLevels(), 1UL,
-                      "The Input(Emission) should be a sequence.");
-    PADDLE_ENFORCE_EQ(ctx.Input<LoDTensor>("Label")->NumLevels(), 1UL,
-                      "The Input(Label) should be a sequence.");
-    auto in_lod = ctx.Input<LoDTensor>("Label")->lod();
-    PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence.");
-    const size_t level = 0;
-    const size_t seq_num = in_lod[level].size() - 1;
-
-    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
-    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
-    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
+    const Tensor* emission_weights = ctx.Input<Tensor>("Emission");
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
     Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
     Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
@@ -76,56 +65,103 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     // Because the computation codes only runs on CPU, here the memory for all
     // the outputs is FIXED to be allocated on the CPU memory.
-    emission_exps->mutable_data<T>(platform::CPUPlace());
+    auto* emission_exps_data =
+        emission_exps->mutable_data<T>(platform::CPUPlace());
+    auto* alpha_data = alpha->mutable_data<T>(platform::CPUPlace());
     transition_exps->mutable_data<T>(platform::CPUPlace());
-    alpha->mutable_data<T>(platform::CPUPlace());
-
-    // Resize the output tensor to its correct dimension.
+    memset(emission_exps_data, 0, emission_exps->numel() * sizeof(T));
+    memset(alpha_data, 0, alpha->numel() * sizeof(T));
+    auto emission_dims = emission_weights->dims();
+
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    Tensor emission_weights_tmp =
+        ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+            emission_weights->dims(), dev_ctx);
+    emission_weights_tmp.ShareDataWith(*emission_weights);
+    Tensor label_tmp =
+        ctx.AllocateTmpTensor<int64_t, platform::CPUDeviceContext>(
+            label->dims(), dev_ctx);
+    label_tmp.ShareDataWith(*label);
+    Tensor emission_exps_tmp =
+        ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+            emission_exps->dims(), dev_ctx);
+    emission_exps_tmp.ShareDataWith(*emission_exps);
+    Tensor alpha_tmp = ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+        alpha->dims(), dev_ctx);
+    alpha_tmp.ShareDataWith(*alpha);
+    size_t seq_num = 0;
+    size_t batch_size;
+    size_t tag_num;
+    const int64_t* length_data = nullptr;
+    framework::Vector<size_t> in_lod;
+    if (ctx.HasInput("length")) {
+      const Tensor* label_length = ctx.Input<framework::Tensor>("length");
+      length_data = label_length->data<int64_t>();
+      seq_num = label_length->numel();
+      batch_size = emission_dims[0] * emission_dims[1];
+      tag_num = emission_dims[2];
+      emission_weights_tmp.Resize(
+          {emission_dims[0] * emission_dims[1], emission_dims[2]});
+      auto label_dims = label->dims();
+      label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]});
+      alpha_tmp.Resize(
+          {emission_dims[0] * emission_dims[1], emission_dims[2]});
+      emission_exps_tmp.Resize(
+          {emission_dims[0] * emission_dims[1], emission_dims[2]});
+      PADDLE_ENFORCE_EQ(seq_num, emission_dims[0],
+                        "The size of Input(length) must be equal to "
+                        "emission_dims[0].");
+      PADDLE_ENFORCE_EQ(seq_num, label_dims[0],
+                        "The size of Input(length) must be equal to "
+                        "label_dims[0].");
+    } else {
+      in_lod = ctx.Input<LoDTensor>("Label")->lod()[0];
+      PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence.");
+      seq_num = in_lod.size() - 1;
+      batch_size = emission_dims[0];
+      tag_num = emission_dims[1];
+    }
+
     ll->Resize({static_cast<int64_t>(seq_num), 1});
     ll->mutable_data<T>(platform::CPUPlace());
-    // Now, all the inputs and outputs should be on the CPU memory.
-    auto emission_dims = emission_weights->dims();
-    const size_t batch_size = emission_dims[0];
-    const size_t tag_num = emission_dims[1];
-
     Tensor emission_row_max;
     emission_row_max.mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(batch_size), 1}),
         platform::CPUPlace());
-
     auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
-    auto x = EigenMatrix<T>::From(*emission_weights);
+    auto x = EigenMatrix<T>::From(emission_weights_tmp);
     auto x_row_max = EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
             .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
-
-    auto x_exps = EigenMatrix<T>::From(*emission_exps);
+    auto x_exps = EigenMatrix<T>::From(emission_exps_tmp);
     x_exps.device(place) =
         (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
-
     auto w = EigenMatrix<T>::From(*transition_weights);
     auto w_exps = EigenMatrix<T>::From(*transition_exps);
     w_exps.device(place) = w.exp();
-
     T* log_likelihood = ll->data<T>();
     for (size_t i = 0; i < seq_num; ++i) {
-      int start_pos = static_cast<int>(in_lod[level][i]);
-      int end_pos = static_cast<int>(in_lod[level][i + 1]);
+      int start_pos = 0;
+      int end_pos = 0;
+      if (ctx.HasInput("length")) {
+        if (length_data[i] == 0) continue;
+        start_pos = i * emission_dims[1];
+        end_pos = start_pos + static_cast<int>(length_data[i]);
+      } else {
+        start_pos = static_cast<int>(in_lod[i]);
+        end_pos = static_cast<int>(in_lod[i + 1]);
+      }
       if (end_pos == start_pos) {
         // If an empty input sequence is given, pad 0 for its cost.
         log_likelihood[i] = 0.;
         continue;
       }
-
-      const Tensor one_seq = emission_weights->Slice(start_pos, end_pos);
+      const Tensor one_seq = emission_weights_tmp.Slice(start_pos, end_pos);
       Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos);
-      Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos);
-      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
-      Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
-
+      Tensor one_seq_exps = emission_exps_tmp.Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos);
+      Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos);
       log_likelihood[i] = ForwardOneSequence(
           one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
           *transition_exps, one_seq_label, &one_seq_alpha);
@@ -197,52 +233,91 @@ template <typename DeviceContext, typename T>
 class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const size_t level = 0;  // currently, only support sequence.
-    auto lod = ctx.Input<LoDTensor>("Label")->lod();
-    PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
-
-    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
+    const Tensor* label = ctx.Input<Tensor>("Label");
     const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
     const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
     const Tensor* alpha = ctx.Input<Tensor>("Alpha");
     const T* ll_grad =
         ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
-
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
     Tensor* emission_grad =
         ctx.Output<Tensor>(framework::GradVarName("Emission"));
+    auto* emission_grad_data =
+        emission_grad->mutable_data<T>(platform::CPUPlace());
+    memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T));
+    Tensor alpha_tmp = ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+        alpha->dims(), dev_ctx);
+    alpha_tmp.ShareDataWith(*alpha);
+    Tensor label_tmp =
+        ctx.AllocateTmpTensor<int64_t, platform::CPUDeviceContext>(
+            label->dims(), dev_ctx);
+    label_tmp.ShareDataWith(*label);
+    Tensor emission_exps_tmp =
+        ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+            emission_exps->dims(), dev_ctx);
+    emission_exps_tmp.ShareDataWith(*emission_exps);
+    Tensor emission_grad_tmp =
+        ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+            emission_grad->dims(), dev_ctx);
+    emission_grad_tmp.ShareDataWith(*emission_grad);
+    // Get seq_num, depending on whether the padding path is used.
+    size_t seq_num = 0;
+    framework::Vector<size_t> lod;
+    const int64_t* length_data = nullptr;
+    if (ctx.HasInput("length")) {
+      const Tensor* label_length = ctx.Input<framework::Tensor>("length");
+      length_data = label_length->data<int64_t>();
+      seq_num = label_length->numel();
+      auto emission_dims = emission_grad->dims();
+      auto label_dims = label->dims();
+      emission_grad_tmp.Resize(
+          {emission_dims[0] * emission_dims[1], emission_dims[2]});
+      label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]});
+      alpha_tmp.Resize(
+          {emission_dims[0] * emission_dims[1], emission_dims[2]});
+      emission_exps_tmp.Resize(
+          {emission_dims[0] * emission_dims[1], emission_dims[2]});
+    } else {
+      lod = ctx.Input<LoDTensor>("Label")->lod()[0];
+      PADDLE_ENFORCE_NE(lod.size(), 0, "Input(Label) must be a sequence.");
+      seq_num = lod.size() - 1;
+    }
+
     Tensor* transition_grad =
         ctx.Output<Tensor>(framework::GradVarName("Transition"));

     // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
     // data reader operator, it can have no gradients.
-    PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null.");
-    emission_grad->mutable_data<T>(platform::CPUPlace());
     if (transition_grad) {
       transition_grad->mutable_data<T>(platform::CPUPlace());
       math::set_constant(ctx.device_context(), transition_grad, 0.);
     }
     // Now, all the inputs and outputs should be on the CPU memory.
-
     auto emission_dims = emission_exps->dims();
     // Beta is the memo table used in dynamic programming to calculate the
-    // backwark vectors. For a backward vector i (the i-th row of beta), it
+    // backward vectors. For a backward vector i (the i-th row of beta), it
     // captures the unnormalized probabilities of partial sequences starting
     // at position i.
     Tensor beta;
-    beta.mutable_data<T>(emission_dims, platform::CPUPlace());
-
-    for (size_t i = 0; i < lod[level].size() - 1; ++i) {
-      int start_pos = static_cast<int>(lod[level][i]);
-      int end_pos = static_cast<int>(lod[level][i + 1]);
-      if (end_pos == start_pos) continue;
-
+    auto* beta_data = beta.mutable_data<T>(emission_dims, platform::CPUPlace());
+    memset(beta_data, 0, beta.numel() * sizeof(T));
+    if (ctx.HasInput("length")) {
+      beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]});
+    }
+    for (size_t i = 0; i < seq_num; ++i) {
+      int start_pos = 0;
+      int end_pos = 0;
+      if (ctx.HasInput("length")) {
+        if (length_data[i] == 0) continue;
+        start_pos = i * emission_dims[1];
+        end_pos = start_pos + static_cast<int>(length_data[i]);
+      } else {
+        start_pos = static_cast<int>(lod[i]);
+        end_pos = static_cast<int>(lod[i + 1]);
+      }
       const Tensor one_seq_emission_exps =
-          emission_exps->Slice(start_pos, end_pos);
-      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
-      const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
+          emission_exps_tmp.Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos);
+      const Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos);
       Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
-      Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos);
-
+      Tensor one_seq_emission_grad =
+          emission_grad_tmp.Slice(start_pos, end_pos);
       BackwardOneSequence(
           ctx.template device_context<platform::CPUDeviceContext>(), ll_grad[i],
           one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
@@ -261,7 +336,6 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
     const T* x_exps = emission_exps.data<T>();
     const int64_t* label_value = label.data<int64_t>();
     T* beta_value = beta->data<T>();
-
     auto x_dims = emission_exps.dims();
     const size_t seq_length = x_dims[0];
     const size_t tag_num = x_dims[1];
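Neither kernel changes the per-sequence math: `ForwardOneSequence` and `BackwardOneSequence` are untouched, and the padding path merely flattens [N, S, D] to [N*S, D] and slices sequence i at offset i * S instead of at LoD offsets. For orientation, here is a log-space NumPy sketch of the forward (alpha) recursion and conditional log-likelihood that the operator's comments describe, assuming the documented [(D + 2) x D] transition layout with rows 0 and 1 holding the start and end weights; the operator's actual Alpha/LogLikelihood outputs may differ in normalization and sign conventions:

.. code-block:: python

    import numpy as np

    def crf_log_likelihood(x, y, w):
        # x: [L, D] emission scores of one sequence; y: [L] gold tag indices;
        # w: [(D + 2), D] with w[0] = start, w[1] = end, w[2:] = transitions.
        L, D = x.shape
        a, b, t = w[0], w[1], w[2:]
        # alpha[k, v]: unnormalized log-score of all partial paths that end
        # in tag v at position k (the memo table the Alpha output stores).
        alpha = np.zeros((L, D))
        alpha[0] = a + x[0]
        for k in range(1, L):
            m = alpha[k - 1].max()  # shift for numerical stability
            alpha[k] = x[k] + m + np.log(np.exp(alpha[k - 1] - m) @ np.exp(t))
        m = alpha[-1].max()
        log_z = m + np.log(np.exp(alpha[-1] - m + b).sum())  # partition term
        # Unnormalized score of the gold path: start + emissions + transitions + end.
        score = a[y[0]] + x[np.arange(L), y].sum() + b[y[-1]]
        score += sum(t[y[k - 1], y[k]] for k in range(1, L))
        return score - log_z

    x = np.random.rand(4, 5)  # L = 4 steps, D = 5 tags
    w = np.random.rand(7, 5)  # (D + 2) x D
    print(crf_log_likelihood(x, np.array([1, 0, 3, 2]), w))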
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
old mode 100644
new mode 100755
index b671f63e86..932054dbf2
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1404,7 +1404,7 @@ def gru_unit(input,


 @templatedoc()
-def linear_chain_crf(input, label, param_attr=None):
+def linear_chain_crf(input, label, param_attr=None, length=None):
     """
     Linear Chain CRF.

@@ -1414,6 +1414,7 @@ def linear_chain_crf(input, label, param_attr=None):
         input(${emission_type}): ${emission_comment}
         input(${transition_type}): ${transition_comment}
         label(${label_type}): ${label_comment}
+        length(${length_type}): ${length_comment}
         param_attr(ParamAttr): The attribute of the learnable parameter.

     Returns:
@@ -1424,16 +1425,62 @@ def linear_chain_crf(input, label, param_attr=None):
     Examples:
         .. code-block:: python
-            import paddle.fluid as fluid
-            emission = fluid.layers.data(name='emission', shape=[1000], dtype='float32')
-            target = fluid.layers.data(name='target', shape=[1], dtype='int32')
-            crf_cost = fluid.layers.linear_chain_crf(
-                input=emission,
-                label=target,
-                param_attr=fluid.ParamAttr(
+            import paddle.fluid as fluid
+            import numpy as np
+
+            # define the net structure, using a LoDTensor
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                input_data = fluid.layers.data(name='input_data', shape=[10], dtype='float32', lod_level=1)
+                label = fluid.layers.data(name='label', shape=[1], dtype='int64', lod_level=1)
+                emission = fluid.layers.fc(input=input_data, size=10, act="tanh")
+                crf_cost = fluid.layers.linear_chain_crf(
+                    input=emission,
+                    label=label,
+                    param_attr=fluid.ParamAttr(
+                        name='crfw',
+                        learning_rate=0.01))
+            use_cuda = False
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup_program)
+
+            # define the data, using a LoDTensor
+            a = fluid.create_lod_tensor(np.random.rand(12, 10).astype('float32'), [[3, 3, 4, 2]], place)
+            b = fluid.create_lod_tensor(np.array([[1], [1], [2], [3], [1], [1], [1], [3], [1], [1], [1], [1]]).astype('int64'), [[3, 3, 4, 2]], place)
+            feed1 = {'input_data': a, 'label': b}
+            loss = exe.run(train_program, feed=feed1, fetch_list=[crf_cost])
+            print(loss)
+
+            # define the net structure, using padding
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                input_data2 = fluid.layers.data(name='input_data2', shape=[10, 10], dtype='float32')
+                label2 = fluid.layers.data(name='label2', shape=[10, 1], dtype='int64')
+                label_length = fluid.layers.data(name='length', shape=[1], dtype='int64')
+                emission2 = fluid.layers.fc(input=input_data2, size=10, act="tanh", num_flatten_dims=2)
+                crf_cost2 = fluid.layers.linear_chain_crf(
+                    input=emission2,
+                    label=label2,
+                    length=label_length,
+                    param_attr=fluid.ParamAttr(
                         name='crfw',
-                        learning_rate=0.2))
+                        learning_rate=0.01))
+
+            use_cuda = False
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup_program)
+
+            # define the data, using padding
+            cc = np.random.rand(4, 10, 10).astype('float32')
+            dd = np.random.randint(0, 10, size=(4, 10, 1)).astype('int64')
+            ll = np.array([[3], [3], [4], [2]]).astype('int64')
+            feed2 = {'input_data2': cc, 'label2': dd, 'length': ll}
+            loss2 = exe.run(train_program, feed=feed2, fetch_list=[crf_cost2])
+            print(loss2)
     """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
@@ -1449,11 +1496,16 @@ def linear_chain_crf(input, label, param_attr=None):
             dtype=helper.input_dtype())
     log_likelihood = helper.create_variable_for_type_inference(
         dtype=helper.input_dtype())
+    this_inputs = {
+        "Emission": [input],
+        "Transition": transition,
+        "Label": [label]
+    }
+    if length:
+        this_inputs['length'] = [length]
     helper.append_op(
         type='linear_chain_crf',
-        inputs={"Emission": [input],
-                "Transition": transition,
-                "Label": label},
+        inputs=this_inputs,
         outputs={
             "Alpha": [alpha],
             "EmissionExps": [emission_exps],
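The examples in the docstring fetch `crf_cost` directly, which yields one cost value per sequence. In a training program these would typically be reduced to a scalar and minimized; a short sketch using standard fluid APIs, to be placed inside the same `program_guard` block as the layer definitions above:

.. code-block:: python

    avg_cost = fluid.layers.mean(crf_cost)
    fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_cost)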
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
old mode 100644
new mode 100755
index b365e1642e..b86d958601
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -111,7 +111,7 @@ class TestLinearChainCrfOp(OpTest):
         lod = [[]]
         seq_start_pos = [0]
         for i in range(SEQ_NUM):
-            lod[-1].append(random.randint(0, MAX_SEQ_LEN))
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
             seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
         emission = np.random.uniform(
             -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
@@ -157,5 +157,81 @@ class TestLinearChainCrfOp(OpTest):
             ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))


+class TestLinearChainCrfPaddingTensor(OpTest):
+    def seq_pad(self, data, length):
+        max_len = np.max(length)
+        shape = [len(length), max_len] + list(data.shape[1:])
+        padded = np.zeros(shape).astype(data.dtype)
+        offset = 0
+        for i, l in enumerate(length):
+            padded[i, 0:l] = data[offset:offset + l]
+            offset += l
+        return padded
+
+    def seq_pad_exps(self, data, length):
+        # Pad EmissionExps with ones instead of zeros, since exp(0) = 1.
+        max_len = np.max(length)
+        shape = [len(length), max_len] + list(data.shape[1:])
+        padded = np.ones(shape).astype(data.dtype)
+        offset = 0
+        for i, l in enumerate(length):
+            padded[i, 0:l] = data[offset:offset + l]
+            offset += l
+        return padded
+
+    def set_test_data_1(self):
+        # Same data as TestLinearChainCrfOp, but fed as padded tensors
+        # plus an explicit length input.
+        SEQ_NUM = 3
+        TAG_NUM = 17
+        MAX_SEQ_LEN = 5
+
+        # the linear_chain_crf operator only supports sequence (LoD level = 1)
+        lod = [[]]
+        seq_start_pos = [0]
+        for i in range(SEQ_NUM):
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
+        emission = np.random.uniform(
+            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
+        emission_row_max = np.amax(emission, axis=1, keepdims=True)
+        emission_exps = np.exp(emission - emission_row_max)
+        transition = np.random.uniform(-0.5, 0.5,
+                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
+        transition_exps = np.exp(transition)
+
+        labels = np.random.randint(
+            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
+        self.inputs = {
+            "Emission": self.seq_pad(emission, lod[0]),
+            "Transition": transition,
+            "Label": self.seq_pad(labels, lod[0]),
+            "length": np.array(lod).astype("int64")
+        }
+        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
+                                    emission_exps, transition, transition_exps,
+                                    labels)
+        alpha, log_likelihood = crf.crf_forward_compute()
+        self.outputs = {
+            "Alpha": self.seq_pad(alpha, lod[0]),
+            "EmissionExps": self.seq_pad_exps(emission_exps, lod[0]),
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        }
+
+    def setUp(self):
+        self.op_type = "linear_chain_crf"
+        self.set_test_data_1()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Emission", "Transition"], "LogLikelihood")
+
+    def test_check_grad_ignore_transition(self):
+        self.check_grad(
+            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab
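A closing observation on the test: `TestLinearChainCrfPaddingTensor` reuses the reference implementation (`LinearChainCrfForward`) on the concatenated data and only pads its inputs and expected outputs, so the padded operator path is expected to agree with the LoD path once padding is stripped. A small NumPy sketch of that equivalence, with a hypothetical `seq_unpad` inverse of the test's `seq_pad`:

.. code-block:: python

    import numpy as np

    def seq_unpad(padded, lengths):
        # Inverse of the test's seq_pad: drop pad slots and re-concatenate.
        return np.concatenate([padded[i, :l] for i, l in enumerate(lengths)])

    lengths = [3, 3, 4, 2]
    flat = np.random.rand(sum(lengths), 17)       # LoD-style reference data
    padded = np.zeros((len(lengths), max(lengths), 17))
    offset = 0
    for i, l in enumerate(lengths):               # same rule as seq_pad
        padded[i, :l] = flat[offset:offset + l]
        offset += l
    assert np.allclose(seq_unpad(padded, lengths), flat)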