diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 3304d857ae2600bd94013b6672b88d43d1d188c6..3962d55324d8147f41dcd3c1685efb573f004e97 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -114,16 +114,19 @@ class Tensor { const platform::DeviceContext& ctx); /** - * @brief Return the slice of the tensor. + * @brief Return a sub-tensor of the given tensor. * - * @param[in] begin_idx The begin index of the slice. - * @param[in] end_idx The end index of the slice. + * @param[in] begin_idx The index of the start row(inclusive) to slice. + * The index number begins from 0. + * @param[in] end_idx The index of the end row(exclusive) to slice. + * The index number begins from 0. */ template inline Tensor Slice(const int& begin_idx, const int& end_idx) const; platform::Place place() const { - PADDLE_ENFORCE_NOT_NULL(holder_, "Tensor get place() must contains holder"); + PADDLE_ENFORCE_NOT_NULL( + holder_, "A holder must exist when calling the method place()."); return holder_->place(); } diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index ce73e0a9edbe340f1165e2dbcba8c976c55df348..635a84f415f0cfd74a50818ef2a8ad10739bef52 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -168,10 +168,11 @@ inline void Tensor::CopyFromVector(const std::vector& src, template inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero."); - PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); PADDLE_ENFORCE_LT(begin_idx, end_idx, - "Begin index must be less than end index."); + "The start row index must be less than the end row index."); if (dims_[0] == 1) { return *this; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 6a13f82cce498f6a4c00770d188ab329b2e09a58..b4ea0338b24a8def00d1e6a6e9d5bbebe0602478 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -49,7 +49,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ "Y"); } - // Explicitly set data type of output of the cross_entropy operator + // Explicitly set that data type of the output of the cross_entropy operator // is determined by its input "X". framework::DataType IndicateDataType( const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index bdff6ffc6a50fbbb59e62d65a36c37e9c056912a..b451ae62e2deac044f8fe04eeae5486d11fb3ead 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -17,6 +17,9 @@ limitations under the License. */ namespace paddle { namespace operators { +using framework::LoDTensor; +using framework::LoD; + class LinearChainCrfOpMaker : public framework::OpProtoAndCheckerMaker { public: LinearChainCrfOpMaker(framework::OpProto* proto, @@ -77,14 +80,14 @@ Please see http://www.cs.columbia.edu/~mcollins/fb.pdf for reference. Equation: -- Denote the first input of this operator (Emission) as \f$x\f$ here. -- The first D values of the second input (Transition) of this operator are for -starting weights, denoted as \f$a\f$ here. -- The next D values of the second input (Transition) of this operator are for -ending weights, denoted as \f$b\f$ here. -- The remaning values of the second input (Transition) are for transition -weights, denoted as \f$w\f$ here. -- Denote the third input of this operator (Label) as \f$s\f$ here. +- Denote Input(Emission) to this operator as \f$x\f$ here. +- The first D values of Input(Transition) to this operator are for starting +weights, denoted as \f$a\f$ here. +- The next D values of Input(Transition) of this operator are for ending +weights, denoted as \f$b\f$ here. +- The remaning values of Input(Transition) are for transition weights, +denoted as \f$w\f$ here. +- Denote Input(Label) as \f$s\f$ here. The probability of a sequence \f$s\f$ of length \f$L\f$ is defined as: \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} @@ -107,8 +110,7 @@ sequences internally, it expects UNSCALED emission feature weights. Please do not call this op with the emission feature being output of any nonlinear activation. -3. The 2nd dimension of the first input of this operator (Emission) MUST be -equal to the tag number. +3. The 2nd dimension of Input(Emission) MUST be equal to the tag number. )DOC"); } @@ -136,33 +138,188 @@ class LinearChainCrfOp : public framework::OperatorWithKernel { auto label_dims = ctx->GetInputDim("Label"); PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, - "The input Emission should be a 2-D tensor."); + "The Input(Emission) should be a 2-D tensor."); PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, - "The input Transition should be a 2-D tensor."); + "The Input(Transition) should be a 2-D tensor."); PADDLE_ENFORCE_EQ( - transition_dims[0] + 2, transition_dims[1], - "An invalid dimension for the input Transition, which should " + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " "be a 2-D tensor with shape [D + 2 x D]."); PADDLE_ENFORCE_EQ( emission_dims[1], transition_dims[1], - "The 2nd dimension of the input Emission and the input Transition " + "The 2nd dimension of the Input(Emission) and the Input(Transition) " "should be equal to the tag number."); PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, - "The input Label should be a 2-D tensor " - "with the 2nd dimensions fixed to 1."); + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); ctx->SetOutputDim("Alpha", emission_dims); + + // (TODO caoying) This is tricky. The 1st dimension of Output(LogLikelihood) + // is the sequence number in a mini-batch. The dimension set here should be + // resized to its correct size in the function Compute. ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1}); } - // Explicitly set data type of output of the linear_chain_crf operator - // is determined by its input "Emission". + // Explicitly set that the data type of output of the linear_chain_crf + // operator is determined by its input "Emission". framework::DataType IndicateDataType( const framework::ExecutionContext& ctx) const override { return framework::ToDataType(ctx.Input("Emission")->type()); } }; +template +class LinearChainCrfOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + + auto* emission_weights = ctx.Input("Emission"); + auto* transition_weights = ctx.Input("Transition"); + auto* label = ctx.Input("Label"); + + auto in_lod = emission_weights->lod(); + // TODO(caoying) The checks related to LoD information should be + // moved into InferShape once after the InferShape is refactored. + PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + const size_t level = 0; + + auto emission_dims = emission_weights->dims(); + const size_t seq_num = in_lod[level].size() - 1; + + // TODO(caoying) These local variables seems to be created and destroied + // every time this function is called. Will this bring additional overhead? + Tensor emission_exps; + Tensor emission_row_max; + Tensor transition_exps; + emission_exps.mutable_data(emission_dims, platform::CPUPlace()); + emission_row_max.mutable_data( + framework::make_ddim({emission_dims[0], 1}), platform::CPUPlace()); + transition_exps.mutable_data(transition_weights->dims(), + platform::CPUPlace()); + + auto* alpha = ctx.Output("Alpha"); + alpha->mutable_data(ctx.GetPlace()); + auto* ll = ctx.Output("LogLikelihood"); + // resize the output tensor to the correct dimension. + ll->Resize({static_cast(seq_num), 1}); + T* log_likelihood = ll->mutable_data(ctx.GetPlace()); + + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(in_lod[level][i]); + int end_pos = static_cast(in_lod[level][i + 1]); + + const Tensor one_seq = emission_weights->Slice(start_pos, end_pos); + Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos); + Tensor one_seq_exps = emission_exps.Slice(start_pos, end_pos); + const Tensor one_seq_label = label->Slice(start_pos, end_pos); + Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + + log_likelihood[i] = ForwardOneSequence( + ctx.device_context(), one_seq, one_seq_row_max, one_seq_exps, + (*transition_weights), transition_exps, one_seq_label, one_seq_alpha); + } + } + + protected: + T ForwardOneSequence(const platform::DeviceContext& ctx, + const Tensor& emission, Tensor& emission_row_max, + Tensor& emission_exps, const Tensor& trans_weights, + Tensor& trans_weight_exps, const Tensor& label, + Tensor& alpha) const { + // (TODO caoying) Evaluate and optimize this. + // The Eigen compution kernel will be invoked for multiple times. + // Some computations regardless of sequence inforamtion could be performed + // only one time for the entire batch. This potentially could be optimized. + + auto x_dims = emission.dims(); + const size_t seq_length = x_dims[0]; + const size_t tag_num = x_dims[1]; + + T* alpha_value = alpha.data(); + + auto x = EigenMatrix::From(emission); + auto x_row_max = EigenMatrix::From(emission_row_max); + const int class_dim = 1; + x_row_max.device(*ctx.GetEigenDevice()) = + x.maximum(Eigen::DSizes(class_dim)) + .reshape(Eigen::DSizes(int(seq_length), 1)); + + auto x_exps = EigenMatrix::From(emission_exps); + x_exps.device(*ctx.GetEigenDevice()) = + (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); + + auto w = EigenMatrix::From(trans_weights); + auto w_exps = EigenMatrix::From(trans_weight_exps); + w_exps.device(*ctx.GetEigenDevice()) = w.exp(); + // The 1st row of w are transition weights for start mask. + const size_t start_ridx = 0; + // The 2nd row of w are transition weights for end mask. + const size_t end_ridx = 1; + // Transition weights among other tags begins from the 3rd row of w. + const size_t state_base_ridx = 2; + + for (size_t i = 0; i < tag_num; ++i) { + alpha_value[i] = w_exps(start_ridx, i) * x_exps(0, i); + } + T ll = -x_row_max(0, 1) - std::log(NormalizeL1(alpha_value, tag_num)); + + for (size_t k = 1; k < seq_length; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + T sum = 0.; + for (size_t j = 0; j < tag_num; ++j) { + sum += alpha_value[(k - 1) * tag_num + j] * + w_exps(j + state_base_ridx, i); + } + alpha_value[k * tag_num + i] = x_exps(k, i) * sum; + } + ll -= x_row_max(k, 1) + + std::log(NormalizeL1(alpha_value + k * tag_num, tag_num)); + } + T sum = 0.; + for (size_t i = 0; i < tag_num; ++i) { + sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps(end_ridx, i); + } + ll -= std::log(sum); + + const int* lbl = label.data(); + PADDLE_ENFORCE_LT( + *std::max_element(lbl, lbl + seq_length), tag_num, + "An invalid tag label that execesses the largest tag number."); + + // Calculate the nominator part, which depends on the label sequence. + ll += w(start_ridx, lbl[0]) + x(start_ridx, lbl[0]) + + w(end_ridx, lbl[seq_length - 1]); + for (size_t k = 1; k < seq_length; ++k) + ll += x(k, lbl[k]) + w(lbl[k - 1], lbl[k]); + return -ll; + } + + private: + T NormalizeL1(T* x, size_t len) const { + T sum = 0.; + for (size_t i = 0; i < len; ++i) sum += x[i]; + // (This comment is from the old LinearChainCRFLayer.) + // Right now, we just bet that sum won't be zero. If this really happens, we + // will figure out what should be done then. + PADDLE_ENFORCE(sum, + "The unnormalized probabilites of all possible unfinished " + "sequences must be greater than 0."); + for (size_t i = 0; i < len; ++i) x[i] /= sum; + return sum; + } +}; + class LinearChainCrfGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -171,12 +328,25 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override {} }; +template +class LinearChainCrfGradOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "This kernel only runs on CPU."); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP(linear_chain_crf, ops::LinearChainCrfOp, ops::LinearChainCrfOpMaker, linear_chain_crf_grad, ops::LinearChainCrfGradOp); -REGISTER_OP_CPU_KERNEL(linear_chain_crf, ops::LinearChainCrfOpKernel); -REGISTER_OP_CPU_KERNEL(linear_chain_crf_grad, - ops::LinearChainCrfGradOpKernel); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf, + ops::LinearChainCrfOpKernel); +REGISTER_OP_CPU_KERNEL( + linear_chain_crf_grad, + ops::LinearChainCrfGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index ddea39b0c707e9c04a227d8fa153c4a7a837cb9e..a656e233c2c6331affca345283a3c22ee32852e1 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -19,27 +19,31 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using framework::Tensor; template using EigenMatrix = framework::EigenMatrix; -template +template class LinearChainCrfOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "This kernel only runs on CPU."); - } + void Compute(const framework::ExecutionContext& ctx) const override; + + protected: + T ForwardOneSequence(const platform::DeviceContext& ctx, + const Tensor& emission, Tensor& emission_row_max, + Tensor& emission_exps, const Tensor& trans_weights, + Tensor& trans_weight_exps, const Tensor& label, + Tensor& a) const; + + private: + T NormalizeL1(T* x, size_t len) const; }; -template +template class LinearChainCrfGradOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "This kernel only runs on CPU."); - } + void Compute(const framework::ExecutionContext& ctx) const override; }; } // namespace operators diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index e639f3a46879a4e4e7087ea91607a387d476199d..98a1c70f11629bac13260d5e67efc7d69ebd16ce 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -60,19 +60,23 @@ Because this operators performs a softmax on logits internally, it expects unscaled logits. Please do not call this op with the output of softmax operator, which will produce incorrect results. -This operators expects mutually exclusive hard labels, each sample in a batch -is in exactly one class with probabilities 1. Each sample in the batch with one -and only one label. +When the attribute softLabel is set false, this operators expects mutually +exclusive hard labels, each sample in a batch is in exactly one class with +probabilities 1. Each sample in the batch with one and only one label. Equation: 1) hard label (one-hot label) -Loss_j = -\text{Logit}_{Label_j} + \log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), j = 1, ..., K +Loss_j = \f$ -\text{Logit}_{Label_j} + +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), +j = 1, ..., K $\f 2) soft label (a distribution over all classes) -Loss_j = -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i-\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), j = 1,...,K +Loss_j = \f$ -\sum_{i=0}^{K}\text{Label}_i\left(\text{Logit}_i - +\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), +j = 1,...,K $\f )DOC"); } diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py index b16c4d40b965d622cd3773fe65fb5e9f5c13cf0d..413210e75b8feeaf76710eb3965a007446aba852 100644 --- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py +++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py @@ -61,13 +61,13 @@ class LinearChainCrfForward(object): s += alpha[-1, i] * self.b_exps[i] log_likelihood -= np.log(s) - # calculate the noninator part. + # calculate the nominator part. log_likelihood += ( self.a[label[0]] + self.x[0, label[0]] + self.b[label[-1]]) for k in range(1, seq_len): log_likelihood += ( self.x[k, label[k]] + self.w[label[k - 1], label[k]]) - return log_likelihood + return -log_likelihood def crf_forward_compute(self): for i in range(self.seq_num): @@ -102,7 +102,7 @@ class TestLinearChainCrfOp(OpTest): self.inputs = { "Emission": (emission, lod), "Transition": transition, - "label": (labels, lod) + "Label": (labels, lod) } crf = LinearChainCrfForward(lod[0], emission, transition, labels)