From 427644b2fa01e6a44b6d3bc0b4d2fcc8ba8b6265 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Mon, 23 Oct 2017 10:07:12 +0800
Subject: [PATCH] fix the computation kernels.

---
 paddle/framework/operator.h                        |   2 +-
 paddle/operators/linear_chain_crf_op.cc            | 122 +++++++++++-------
 paddle/operators/linear_chain_crf_op.h             |   2 +-
 .../tests/test_linear_chain_crf_op.py              |  15 +--
 4 files changed, 84 insertions(+), 57 deletions(-)

diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 0d0304ac9e..e9cf2f97e0 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -659,7 +659,7 @@ class OperatorWithKernel : public OperatorBase {
       if (t != nullptr) {
         int tmp = static_cast<int>(ToDataType(t->type()));
         PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                       "DataType of Paddle Op must be same.");
+                       "DataType of Paddle Op must be the same.");
         data_type = tmp;
       }
     }
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 268b1c41db..12034d7d6e 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -165,11 +165,11 @@ class LinearChainCrfOp : public framework::OperatorWithKernel {
                    "Output(LogLikelihood) should not be null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    auto transition_dims = ctx->GetInputDim("Transition");
-    auto label_dims = ctx->GetInputDim("Label");
-
     PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
                       "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
+
+    auto transition_dims = ctx->GetInputDim("Transition");
     PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
@@ -180,6 +180,8 @@ class LinearChainCrfOp : public framework::OperatorWithKernel {
         emission_dims[1], transition_dims[1],
         "The 2nd dimension of the Input(Emission) and the Input(Transition) "
         "should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimension fixed to 1.");
@@ -204,7 +206,7 @@ class LinearChainCrfOp : public framework::OperatorWithKernel {
   // operator is determined by its input "Emission".
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Emission")->type());
+    return framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type());
   }
 };
 
@@ -224,6 +226,8 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
     auto* label = ctx.Input<LoDTensor>("Label");
 
     auto in_lod = emission_weights->lod();
+    PADDLE_ENFORCE(in_lod.size(), "Input(Emission) is not a sequence.");
+
     // TODO(caoying) The checks related to LoD information should be
     // moved into InferShape once the InferShape is refactored.
     PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
@@ -266,12 +270,17 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
     for (size_t i = 0; i < seq_num; ++i) {
       int start_pos = static_cast<int>(in_lod[level][i]);
       int end_pos = static_cast<int>(in_lod[level][i + 1]);
+      if (end_pos == start_pos) {
+        // If an empty input sequence is given, pad 0 for its cost.
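+        // A zero-length slice cannot go through the forward algorithm, so
+        // an empty sequence contributes nothing to the total cost.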
+        log_likelihood[i] = static_cast<T>(0.);
+        continue;
+      }
 
-      const Tensor one_seq = emission_weights->Slice<T>(start_pos, end_pos);
-      Tensor one_seq_row_max = emission_row_max.Slice<T>(start_pos, end_pos);
-      Tensor one_seq_exps = emission_exps->Slice<T>(start_pos, end_pos);
-      const Tensor one_seq_label = label->Slice<T>(start_pos, end_pos);
-      Tensor one_seq_alpha = alpha->Slice<T>(start_pos, end_pos);
+      const Tensor one_seq = emission_weights->Slice(start_pos, end_pos);
+      Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos);
+      Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
 
       log_likelihood[i] = ForwardOneSequence(
           &one_seq, &one_seq_row_max, &one_seq_exps, transition_weights,
@@ -306,7 +315,7 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
 
     for (size_t k = 1; k < seq_length; ++k) {
       for (size_t i = 0; i < tag_num; ++i) {
-        T sum = 0.;
+        T sum = static_cast<T>(0.);
         for (size_t j = 0; j < tag_num; ++j) {
           sum += alpha_value[(k - 1) * tag_num + j] *
                  w_exps[(j + state_trans_base_idx) * tag_num + i];
@@ -326,11 +335,14 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
     PADDLE_ENFORCE_LT(
         *std::max_element(lbl, lbl + seq_length), tag_num,
         "An invalid tag label that exceeds the largest tag number.");
+
+    // Calculate the numerator part, which depends on the label sequence.
     ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] +
           w[tag_num + lbl[seq_length - 1]] /*end transition*/;
-    for (size_t k = 1; k < seq_length; ++k)
-      ll += x[k * tag_num + lbl[k]] + w[lbl[k - 1] * tag_num + lbl[k]];
+    for (size_t k = 1; k < seq_length; ++k) {
+      ll += x[k * tag_num + lbl[k]] +
+            w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]];
+    }
     return -ll;
   }
 };
@@ -353,12 +365,13 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel {
                    "Output(Transition@GRAD) should not be null.");
 
     auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
-    auto transition_exps_dims =
-        ctx->GetInputDim(framework::GradVarName("TransitionExps"));
-    auto label_dims = ctx->GetInputDim("Label");
-
     PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL,
                       "The Input(EmissionExps) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_exps_dims[0],
+                   "An empty mini-batch is not allowed.");
+
+    auto transition_exps_dims =
+        ctx->GetInputDim(framework::GradVarName("TransitionExps"));
     PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL,
                       "The Input(TransitionExps) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
@@ -369,6 +382,8 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel {
         emission_exps_dims[1], transition_exps_dims[1],
         "The 2nd dimension of the Input(EmissionExps) and the "
         "Input(TransitionExps) should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimension fixed to 1.");
@@ -381,6 +396,14 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("Transition"),
                       transition_exps_dims);
   }
+
+ protected:
+  // Explicitly set that the data type of output of the linear_chain_crf_grad
+  // operator is determined by its input "EmissionExps".
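+  // ("Emission" itself is not an input of the gradient operator, so the
+  // kernel's data type is inferred from the forward activations instead.)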
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<LoDTensor>("EmissionExps")->type());
+  }
 };
 
 template <typename T>
@@ -390,12 +413,12 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto* ll_grad =
-        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"));
     auto* label = ctx.Input<LoDTensor>("Label");
     auto* emission_exps = ctx.Input<LoDTensor>("EmissionExps");
     auto* transition_exps = ctx.Input<Tensor>("TransitionExps");
-    auto* alpha = ctx.Input<Tensor>("Alpha");
+    auto* alpha = ctx.Input<LoDTensor>("Alpha");
+    const T* ll_grad =
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
 
     auto* emission_grad =
         ctx.Output<Tensor>(framework::GradVarName("Emission"));
@@ -413,34 +436,31 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
     Tensor beta;
     beta.mutable_data<T>(emission_dims, platform::CPUPlace());
 
-    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
-    auto x_grad = EigenMatrix<T>::From(*emission_grad);
-    auto out_grad = EigenMatrix<T>::From(*ll_grad);
-    x_grad.device(place) =
-        x_grad * out_grad.broadcast(Eigen::DSizes<int, 2>(1, emission_dims[1]));
-
     const size_t level = 0;  // currently, only support sequence.
-    auto lod = emission_exps->lod();
+    auto lod = label->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Label) is not a sequence.");
+
     for (size_t i = 0; i < lod[level].size() - 1; ++i) {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
+      if (end_pos == start_pos) continue;
 
       const Tensor one_seq_emission_exps =
-          emission_exps->Slice<T>(start_pos, end_pos);
-      const Tensor one_seq_label = label->Slice<T>(start_pos, end_pos);
-      const Tensor one_seq_alpha = alpha->Slice<T>(start_pos, end_pos);
-      Tensor one_seq_beta = beta.Slice<T>(start_pos, end_pos);
-      Tensor one_seq_emission_grad =
-          emission_grad->Slice<T>(start_pos, end_pos);
-
-      BackwardOneSequence(ctx.device_context(), &one_seq_emission_exps,
-                          transition_exps, &one_seq_alpha, &one_seq_label,
-                          &one_seq_beta, trans_grad, &one_seq_emission_grad);
+          emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
+      Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
+      Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos);
+
+      BackwardOneSequence(ctx.device_context(), ll_grad[i],
+                          &one_seq_emission_exps, transition_exps,
+                          &one_seq_alpha, &one_seq_label, &one_seq_beta,
+                          trans_grad, &one_seq_emission_grad);
     }
   }
 
  protected:
-  void BackwardOneSequence(const platform::DeviceContext& ctx,
+  void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad,
                            const Tensor* emission_exps,
                            const Tensor* transition_exps, const Tensor* alpha,
                            const Tensor* label, Tensor* beta,
@@ -457,12 +477,15 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
     const size_t state_trans_base_idx = 2;
 
     // Calculate the backward vectors beta.
-    for (int i = 0; i < tag_num; ++i)
+    // First, calculate the initial state.
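+    // The last row of beta is seeded with the exponentiated end-transition
+    // weights, mirroring how alpha is seeded with the start transitions.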
+    for (int i = 0; i < tag_num; ++i) {
       beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i];
+    }
     NormalizeL1<T>(beta_value + (seq_length - 1) * tag_num, tag_num);
+
     for (int k = seq_length - 2; k >= 0; --k) {
       for (int i = 0; i < tag_num; ++i) {
-        T sum = 0.;
+        T sum = static_cast<T>(0.);
         for (int j = 0; j < tag_num; ++j) {
           sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
                  x_exps[(k + 1) * tag_num + j] *
@@ -476,6 +499,7 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
 
     auto alpha_mat = EigenMatrix<T>::From(*alpha);
     auto beta_mat = EigenMatrix<T>::From(*beta);
     auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
+    x_grad_mat.setConstant(ll_grad);
     auto* place = ctx.GetEigenDevice<platform::CPUPlace>();
     x_grad_mat.device(*place) = alpha_mat * beta_mat;
@@ -483,8 +507,9 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
             .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
             .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
 
-    for (int k = 0; k < seq_length; ++k)
+    for (int k = 0; k < seq_length; ++k) {
       x_grad_mat(k, label_value[k]) -= static_cast<T>(1);
+    }
 
     if (transition_grad) {
       T* trans_grad = transition_grad->data<T>();
@@ -501,20 +526,23 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
               .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
 
       for (int k = 1; k < seq_length; ++k) {
-        T sum = 0.;
+        T sum = static_cast<T>(0.);
         for (int i = 0; i < tag_num; ++i) {
-          for (int j = 0; j < tag_num; ++j)
-            sum += x_exps_mat(i, j) * alpha_mat(k - 1, i) * beta_mat(k, j);
+          for (int j = 0; j < tag_num; ++j) {
+            sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
+                   alpha_mat(k - 1, i) * beta_mat(k, j);
+          }
         }
-        sum = static_cast<T>(1) / sum;
+        sum = static_cast<T>(1.) / sum;
         for (int i = 0; i < tag_num; ++i) {
           for (int j = 0; j < tag_num; ++j) {
-            trans_grad[(i + 2) * tag_num + j] +=
-                sum * x_exps_mat(i, j) * alpha_mat(k - 1, i) * beta_mat(k, j);
+            trans_grad[(i + state_trans_base_idx) * tag_num + j] +=
+                sum * w_exps[(i + state_trans_base_idx) * tag_num + j] *
+                alpha_mat(k - 1, i) * beta_mat(k, j);
           }
         }
         trans_grad[label_value[k - 1] * tag_num + label_value[k]] -=
-            static_cast<T>(1);
+            static_cast<T>(1.);
       }
     }
   }
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index e9852de595..f65d268bb6 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -42,7 +42,7 @@ class LinearChainCrfGradOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override;
 
  protected:
-  void BackwardOneSequence(const platform::DeviceContext& ctx,
+  void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad,
                            const Tensor* emission_exps,
                            const Tensor* transition_exps, const Tensor* alpha,
                            const Tensor* label, Tensor* beta,
diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
index 9b73e26eb9..0f169ada95 100644
--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
@@ -4,8 +4,6 @@
 import numpy as np
 
 from op_test import OpTest
 
-import pdb
-
 
 class LinearChainCrfForward(object):
     def __init__(self, seq_start_positions, emission_weights, emission_row_max,
@@ -65,10 +63,10 @@ class LinearChainCrfForward(object):
         # calculate the numerator part.
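+        # (The numerator is the gold path's score: start transition a, the
+        # per-step emissions x, pairwise transitions w, and end transition b.)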
         log_likelihood += (
-            self.a[label[0]] + self.x[0, label[0]] + self.b[label[-1]])
+            self.a[label[0]] + x[0, label[0]] + self.b[label[-1]])
+
         for k in range(1, seq_len):
-            log_likelihood += (
-                self.x[k, label[k]] + self.w[label[k - 1], label[k]])
+            log_likelihood += (x[k, label[k]] + self.w[label[k - 1], label[k]])
         return -log_likelihood
 
     def crf_forward_compute(self):
@@ -77,7 +75,7 @@ class LinearChainCrfForward(object):
             end = self.seq_start_positions[i + 1]
 
             self.log_likelihood[i] = self._forward_a_sequence(
-                self.x[start:end], self.x_row_max[start:end, :],
+                self.x[start:end, :], self.x_row_max[start:end, :],
                 self.x_exps[start:end, :], self.labels[start:end, :],
                 self.alpha[start:end, :])
         return self.alpha, self.log_likelihood
@@ -85,10 +83,11 @@
 
 class TestLinearChainCrfOp(OpTest):
     def set_test_data(self):
-        SEQ_NUM = 3
+        SEQ_NUM = 2
         TAG_NUM = 17
-        MAX_SEQ_LEN = 13
+        MAX_SEQ_LEN = 5
 
+        random.seed(1)
         # the linear_chain_crf operator only supports sequence (LoD level = 1)
         lod = [[0]]
         for i in range(SEQ_NUM):
-- 
GitLab
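
Appendix (not part of the patch): a minimal NumPy sketch of the quantity the
forward kernel computes, under the operator's transition layout (row 0 holds
the start weights, row 1 the end weights, and the remaining rows the
tag-to-tag weights). The function name and arguments are illustrative, and,
unlike the kernel, the sketch works in raw exp space rather than rescaling
alpha with NormalizeL1 for numerical stability.

import numpy as np

def crf_neg_log_likelihood(emission, transition, labels):
    # emission:   [seq_len, tag_num] unnormalized emission scores x.
    # transition: [tag_num + 2, tag_num]; row 0 = start weights a,
    #             row 1 = end weights b, rows 2.. = tag-to-tag weights w.
    # labels:     [seq_len] gold tag indices.
    seq_len, tag_num = emission.shape
    a, b, w = transition[0], transition[1], transition[2:]

    # Denominator log(Z) via the forward recursion over alpha.
    alpha = np.exp(a + emission[0])
    for k in range(1, seq_len):
        alpha = np.exp(emission[k]) * (alpha @ np.exp(w))
    log_z = np.log(np.sum(alpha * np.exp(b)))

    # Numerator: the score of the gold label path.
    score = a[labels[0]] + emission[0, labels[0]] + b[labels[-1]]
    for k in range(1, seq_len):
        score += emission[k, labels[k]] + w[labels[k - 1], labels[k]]
    return log_z - score

For example, with TAG_NUM = 17 as in the test above, emission has shape
(seq_len, 17), transition has shape (19, 17), and the returned value is a
positive scalar, since Z sums the exponentiated scores of all label paths,
including the gold one.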