follow comments.

2ac9a3d8 · caoying03 · dd2be3da · 2ac9a3d8 · 2ac9a3d8 · 2ac9a3d8
4 changed files
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -235,7 +235,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
  PADDLE_ENFORCE_LT(
      begin_idx, end_idx,
-      "The start row index must be smaller than the end row index.");
+      "The start row index must be lesser than the end row index.");
  if (dims_[0] == 1) {
    return *this;

--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -26,9 +26,8 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
        "Emission",
        "(LoDTensor, default: LoDTensor<float>). "
        "The unscaled emission weight matrix for the linear chain CRF. "
-        "This input is a LoDTensor with shape [N x D] where N is the total "
+        "This input is a LoDTensor with shape [N x D] where N is the size of "
-        "element number of all input squences in a mini-batch, "
+        "the mini-batch and D is the total tag number.");
-        "and D is the total tag number.");
    AddInput(
        "Transition",
        "(Tensor, default: Tensor<float>). A Tensor with shape [(D + 2) x D]. "
@@ -36,7 +35,7 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
        "See more details in the operator's comments.");
    AddInput(
        "Label",
-        "(LoDTensor, default: LoDTensor<int>). The groundtruth which is a 2-D "
+        "(LoDTensor, default: LoDTensor<int>). The ground truth which is a 2-D "
        "LoDTensor with shape [N x 1], where N is the total element number in "
        "a mini-batch.");
    AddOutput(
@@ -77,12 +76,13 @@ variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
 Linear chain CRF is a special case of CRF that is useful for sequence labeling
 task. Sequence labeling tasks do not assume a lot of conditional
-independences among inputs. They only concern about the input and the output
+independences among inputs. The only constraint they impose is that the input
-being linear sequences. Thus, the graph model of such a CRF is a simple chain
+and output must be linear sequences. Thus, the graph of such a CRF is a simple
-or a line, which results in the linear chain CRF.
+chain or a line, which results in the linear chain CRF.
 This operator implements the Forward-Backward algorithm for the linear chain
-CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
+CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
 Equation:
@@ -111,7 +111,7 @@ NOTE:
 transition features. The emission feature weights are NOT computed in
 this operator. They MUST be computed first before this operator is called.
-2. Because this operator performs globally normaliztion over all possible
+2. Because this operator performs global normalization over all possible
 sequences internally, it expects UNSCALED emission feature weights.
 Please do not call this op with the emission feature being output of any
 nonlinear activation.
@@ -171,9 +171,10 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Alpha", emission_dims);
    ctx->SetOutputDim("EmissionExps", emission_dims);
    ctx->SetOutputDim("TransitionExps", transition_dims);
-    // (TODO caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
+    // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
    // is the sequence number in a mini-batch. The dimension set here should be
-    // resized to its correct size in the function Compute.
+    // resized to its correct size in the function Compute. Fix this once we can
+    // get LoD information in the InferShape interface.
    ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1});
  }
@@ -236,7 +237,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 protected:
  // Explicitly set that the data type of output of the linear_chain_crf_grad
-  // operator is determined by its input: graidents of LogLikelihood.
+  // operator is determined by its input: gradients of LogLikelihood.
  framework::DataType IndicateDataType(
      const framework::ExecutionContext& ctx) const override {
    return framework::ToDataType(

--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -188,7 +188,6 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
                            const LoDTensor& src, LoDTensor* dst) {
      dst->mutable_data<T>(src.dims(), platform::CPUPlace());
      dst->CopyFrom(src, platform::CPUPlace(), ctx);
    };
    copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
@@ -248,7 +247,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
      for (size_t i = 0; i < tag_num; ++i) {
        T sum = 0.;
        for (size_t j = 0; j < tag_num; ++j) {
-          sum += alpha_value[(k - 1) * tag_num + j] *
+          sum += alpha_value[(k - 1) * tag_num + j] *  // (*)
                 w_exps[(j + state_trans_base_idx) * tag_num + i];
        }
        alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum;
@@ -291,7 +290,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
    // These local variables hold the inputs and outputs, guaranteeing them on
    // CPU memory, to provide a consistent reference.
    // TODO(caoying) Fix this by moving all these local variables into the
-    // class's data members once we can profile the training process.
+    // class's data members once we can profile the training process, or
+    // implementing a real GPU kernel for CRF.
    Tensor* label = nullptr;
    Tensor label_tensor;
    Tensor* emission_exps = nullptr;
@@ -344,6 +344,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
      transition_grad =
          ctx.Output<Tensor>(framework::GradVarName("Transition"));
    }
+    // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
+    // data reader operator, it can have no gradients.
    PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null.");
    emission_grad->mutable_data<T>(platform::CPUPlace());
    math::SetConstant<platform::CPUPlace, T>()(ctx.device_context(),
@@ -458,7 +461,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
      for (size_t i = 0; i < tag_num; ++i) {
        T sum = 0.;
        for (size_t j = 0; j < tag_num; ++j) {
-          sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
+          sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *  // (**)
                 x_exps[(k + 1) * tag_num + j] *
                 beta_value[(k + 1) * tag_num + j];
        }
@@ -493,7 +496,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
      auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
-      // TODO(caoying): Fix this to avoid using this local variable.
+      // TODO(caoying): Fix this to avoid using this local variable once we can
+      // profile the training process.
      Tensor tmp;
      tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
      auto tmp_mat = EigenMatrix<T>::From(tmp);

--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
@@ -83,6 +83,9 @@ class LinearChainCrfForward(object):
 class TestLinearChainCrfOp(OpTest):
    def set_test_data(self):
+        # TODO(caoying) Fix the unittest by adding boundary cases where the
+        # sequence lengths are 1, 2, and 3.
        SEQ_NUM = 3
        TAG_NUM = 17
        MAX_SEQ_LEN = 5