From 427644b2fa01e6a44b6d3bc0b4d2fcc8ba8b6265 Mon Sep 17 00:00:00 2001
From: caoying03
Date: Mon, 23 Oct 2017 10:07:12 +0800
Subject: [PATCH] fix the computation kernels.

---
 paddle/framework/operator.h                        |   2 +-
 paddle/operators/linear_chain_crf_op.cc            | 122 +++++++++++-------
 paddle/operators/linear_chain_crf_op.h             |   2 +-
 .../tests/test_linear_chain_crf_op.py              |  15 +--
 4 files changed, 84 insertions(+), 57 deletions(-)

diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 0d0304ac9e..e9cf2f97e0 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -659,7 +659,7 @@ class OperatorWithKernel : public OperatorBase {
       if (t != nullptr) {
         int tmp = static_cast<int>(ToDataType(t->type()));
         PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                       "DataType of Paddle Op must be same.");
+                       "DataType of Paddle Op must be the same.");
         data_type = tmp;
       }
     }
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index 268b1c41db..12034d7d6e 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -165,11 +165,11 @@ class LinearChainCrfOp : public framework::OperatorWithKernel {
                    "Output(LogLikelihood) should not be null.");
 
     auto emission_dims = ctx->GetInputDim("Emission");
-    auto transition_dims = ctx->GetInputDim("Transition");
-    auto label_dims = ctx->GetInputDim("Label");
-
     PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
                       "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
+
+    auto transition_dims = ctx->GetInputDim("Transition");
     PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
                       "The Input(Transition) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
@@ -180,6 +180,8 @@ class LinearChainCrfOp : public framework::OperatorWithKernel {
         emission_dims[1], transition_dims[1],
         "The 2nd dimension of the Input(Emission) and the Input(Transition) "
         "should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimension fixed to 1.");
@@ -204,7 +206,7 @@ class LinearChainCrfOp : public framework::OperatorWithKernel {
   // operator is determined by its input "Emission".
   framework::DataType IndicateDataType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::ToDataType(ctx.Input<Tensor>("Emission")->type());
+    return framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type());
   }
 };
 
@@ -224,6 +226,8 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
     auto* label = ctx.Input<LoDTensor>("Label");
 
     auto in_lod = emission_weights->lod();
+    PADDLE_ENFORCE(in_lod.size(), "Input(Emission) is not a sequence.");
+
     // TODO(caoying) The checks related to LoD information should be
     // moved into InferShape once the InferShape is refactored.
     PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
@@ -266,12 +270,17 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
     for (size_t i = 0; i < seq_num; ++i) {
       int start_pos = static_cast<int>(in_lod[level][i]);
       int end_pos = static_cast<int>(in_lod[level][i + 1]);
+      if (end_pos == start_pos) {
+        // If an empty input sequence is given, pad 0 for its cost.
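+        // A zero-length slice cannot go through the forward algorithm, so
+        // an empty sequence contributes nothing to the total cost.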
+        log_likelihood[i] = static_cast<T>(0.);
+        continue;
+      }
 
-      const Tensor one_seq = emission_weights->Slice<T>(start_pos, end_pos);
-      Tensor one_seq_row_max = emission_row_max.Slice<T>(start_pos, end_pos);
-      Tensor one_seq_exps = emission_exps->Slice<T>(start_pos, end_pos);
-      const Tensor one_seq_label = label->Slice<T>(start_pos, end_pos);
-      Tensor one_seq_alpha = alpha->Slice<T>(start_pos, end_pos);
+      const Tensor one_seq = emission_weights->Slice(start_pos, end_pos);
+      Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos);
+      Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
 
       log_likelihood[i] = ForwardOneSequence(
           &one_seq, &one_seq_row_max, &one_seq_exps, transition_weights,
@@ -306,7 +315,7 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
 
     for (size_t k = 1; k < seq_length; ++k) {
       for (size_t i = 0; i < tag_num; ++i) {
-        T sum = 0.;
+        T sum = static_cast<T>(0.);
         for (size_t j = 0; j < tag_num; ++j) {
           sum += alpha_value[(k - 1) * tag_num + j] *
                  w_exps[(j + state_trans_base_idx) * tag_num + i];
@@ -326,11 +335,14 @@ class LinearChainCrfOpKernel<platform::CPUPlace, T>
     PADDLE_ENFORCE_LT(
         *std::max_element(lbl, lbl + seq_length), tag_num,
         "An invalid tag label that exceeds the largest tag number.");
+
+    // Calculate the numerator part, which depends on the label sequence.
     ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] +
           w[tag_num + lbl[seq_length - 1]] /*end transition*/;
-    for (size_t k = 1; k < seq_length; ++k)
-      ll += x[k * tag_num + lbl[k]] + w[lbl[k - 1] * tag_num + lbl[k]];
+    for (size_t k = 1; k < seq_length; ++k) {
+      ll += x[k * tag_num + lbl[k]] +
+            w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]];
+    }
     return -ll;
   }
 };
@@ -353,12 +365,13 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel {
                    "Output(Transition@GRAD) should not be null.");
 
     auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
-    auto transition_exps_dims =
-        ctx->GetInputDim(framework::GradVarName("TransitionExps"));
-    auto label_dims = ctx->GetInputDim("Label");
-
     PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL,
                       "The Input(EmissionExps) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_exps_dims[0],
+                   "An empty mini-batch is not allowed.");
+
+    auto transition_exps_dims =
+        ctx->GetInputDim(framework::GradVarName("TransitionExps"));
     PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL,
                       "The Input(TransitionExps) should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(
@@ -369,6 +382,8 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel {
         emission_exps_dims[1], transition_exps_dims[1],
         "The 2nd dimension of the Input(EmissionExps) and the "
         "Input(TransitionExps) should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimension fixed to 1.");
@@ -381,6 +396,14 @@ class LinearChainCrfGradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("Transition"),
                       transition_exps_dims);
   }
+
+ protected:
+  // Explicitly set that the data type of output of the linear_chain_crf_grad
+  // operator is determined by its input "EmissionExps".
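+  // ("Emission" itself is not an input of the gradient operator, so the
+  // kernel's data type is inferred from the forward activations instead.)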
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<LoDTensor>("EmissionExps")->type());
+  }
 };
 
 template <typename T>
@@ -390,12 +413,12 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto* ll_grad =
-        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"));
     auto* label = ctx.Input<LoDTensor>("Label");
     auto* emission_exps = ctx.Input<LoDTensor>("EmissionExps");
     auto* transition_exps = ctx.Input<Tensor>("TransitionExps");
-    auto* alpha = ctx.Input<Tensor>("Alpha");
+    auto* alpha = ctx.Input<LoDTensor>("Alpha");
+    const T* ll_grad =
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
 
     auto* emission_grad =
         ctx.Output<Tensor>(framework::GradVarName("Emission"));
@@ -413,34 +436,31 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
     Tensor beta;
     beta.mutable_data<T>(emission_dims, platform::CPUPlace());
 
-    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
-    auto x_grad = EigenMatrix<T>::From(*emission_grad);
-    auto out_grad = EigenMatrix<T>::From(*ll_grad);
-    x_grad.device(place) =
-        x_grad * out_grad.broadcast(Eigen::DSizes<int, 2>(1, emission_dims[1]));
-
     const size_t level = 0;  // currently, only support sequence.
-    auto lod = emission_exps->lod();
+    auto lod = label->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Label) is not a sequence.");
+
     for (size_t i = 0; i < lod[level].size() - 1; ++i) {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
+      if (end_pos == start_pos) continue;
 
       const Tensor one_seq_emission_exps =
-          emission_exps->Slice<T>(start_pos, end_pos);
-      const Tensor one_seq_label = label->Slice<T>(start_pos, end_pos);
-      const Tensor one_seq_alpha = alpha->Slice<T>(start_pos, end_pos);
-      Tensor one_seq_beta = beta.Slice<T>(start_pos, end_pos);
-      Tensor one_seq_emission_grad =
-          emission_grad->Slice<T>(start_pos, end_pos);
-
-      BackwardOneSequence(ctx.device_context(), &one_seq_emission_exps,
-                          transition_exps, &one_seq_alpha, &one_seq_label,
-                          &one_seq_beta, trans_grad, &one_seq_emission_grad);
+          emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
+      Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
+      Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos);
+
+      BackwardOneSequence(ctx.device_context(), ll_grad[i],
+                          &one_seq_emission_exps, transition_exps,
+                          &one_seq_alpha, &one_seq_label, &one_seq_beta,
+                          trans_grad, &one_seq_emission_grad);
     }
   }
 
  protected:
-  void BackwardOneSequence(const platform::DeviceContext& ctx,
+  void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad,
                            const Tensor* emission_exps,
                            const Tensor* transition_exps, const Tensor* alpha,
                            const Tensor* label, Tensor* beta,
@@ -457,12 +477,15 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
     const size_t state_trans_base_idx = 2;
 
     // Calculate the backward vectors beta.
-    for (int i = 0; i < tag_num; ++i)
+    // First, calculate the initial state.
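+    // The last row of beta is seeded with the exponentiated end-transition
+    // weights, mirroring how alpha is seeded with the start transitions.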
+    for (int i = 0; i < tag_num; ++i) {
       beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i];
+    }
     NormalizeL1<T>(beta_value + (seq_length - 1) * tag_num, tag_num);
+
     for (int k = seq_length - 2; k >= 0; --k) {
       for (int i = 0; i < tag_num; ++i) {
-        T sum = 0.;
+        T sum = static_cast<T>(0.);
         for (int j = 0; j < tag_num; ++j) {
           sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
                  x_exps[(k + 1) * tag_num + j] *
@@ -476,6 +499,7 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
 
     auto alpha_mat = EigenMatrix<T>::From(*alpha);
     auto beta_mat = EigenMatrix<T>::From(*beta);
     auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
+    x_grad_mat.setConstant(ll_grad);
     auto* place = ctx.GetEigenDevice<platform::CPUPlace>();
     x_grad_mat.device(*place) = alpha_mat * beta_mat;
@@ -483,8 +507,9 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
             .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
             .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
 
-    for (int k = 0; k < seq_length; ++k)
+    for (int k = 0; k < seq_length; ++k) {
       x_grad_mat(k, label_value[k]) -= static_cast<T>(1);
+    }
 
     if (transition_grad) {
       T* trans_grad = transition_grad->data<T>();
@@ -501,20 +526,23 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
               .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
 
       for (int k = 1; k < seq_length; ++k) {
-        T sum = 0.;
+        T sum = static_cast<T>(0.);
         for (int i = 0; i < tag_num; ++i) {
-          for (int j = 0; j < tag_num; ++j)
-            sum += x_exps_mat(i, j) * alpha_mat(k - 1, i) * beta_mat(k, j);
+          for (int j = 0; j < tag_num; ++j) {
+            sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
+                   alpha_mat(k - 1, i) * beta_mat(k, j);
+          }
         }
-        sum = static_cast<T>(1) / sum;
+        sum = static_cast<T>(1.) / sum;
         for (int i = 0; i < tag_num; ++i) {
           for (int j = 0; j < tag_num; ++j) {
-            trans_grad[(i + 2) * tag_num + j] +=
-                sum * x_exps_mat(i, j) * alpha_mat(k - 1, i) * beta_mat(k, j);
+            trans_grad[(i + state_trans_base_idx) * tag_num + j] +=
+                sum * w_exps[(i + state_trans_base_idx) * tag_num + j] *
+                alpha_mat(k - 1, i) * beta_mat(k, j);
           }
         }
         trans_grad[label_value[k - 1] * tag_num + label_value[k]] -=
-            static_cast<T>(1);
+            static_cast<T>(1.);
       }
     }
   }
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index e9852de595..f65d268bb6 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -42,7 +42,7 @@ class LinearChainCrfGradOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override;
 
  protected:
-  void BackwardOneSequence(const platform::DeviceContext& ctx,
+  void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad,
                            const Tensor* emission_exps,
                            const Tensor* transition_exps, const Tensor* alpha,
                            const Tensor* label, Tensor* beta,
diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
index 9b73e26eb9..0f169ada95 100644
--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
@@ -4,8 +4,6 @@
 import numpy as np
 
 from op_test import OpTest
 
-import pdb
-
 
 class LinearChainCrfForward(object):
     def __init__(self, seq_start_positions, emission_weights, emission_row_max,
@@ -65,10 +63,10 @@ class LinearChainCrfForward(object):
         # calculate the numerator part.
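+        # (The numerator is the gold path's score: start transition a, the
+        # per-step emissions x, pairwise transitions w, and end transition b.)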
         log_likelihood += (
-            self.a[label[0]] + self.x[0, label[0]] + self.b[label[-1]])
+            self.a[label[0]] + x[0, label[0]] + self.b[label[-1]])
+
         for k in range(1, seq_len):
-            log_likelihood += (
-                self.x[k, label[k]] + self.w[label[k - 1], label[k]])
+            log_likelihood += (x[k, label[k]] + self.w[label[k - 1], label[k]])
         return -log_likelihood
 
     def crf_forward_compute(self):
@@ -77,7 +75,7 @@ class LinearChainCrfForward(object):
             end = self.seq_start_positions[i + 1]
 
             self.log_likelihood[i] = self._forward_a_sequence(
-                self.x[start:end], self.x_row_max[start:end, :],
+                self.x[start:end, :], self.x_row_max[start:end, :],
                 self.x_exps[start:end, :], self.labels[start:end, :],
                 self.alpha[start:end, :])
         return self.alpha, self.log_likelihood
@@ -85,10 +83,11 @@
 
 class TestLinearChainCrfOp(OpTest):
     def set_test_data(self):
-        SEQ_NUM = 3
+        SEQ_NUM = 2
         TAG_NUM = 17
-        MAX_SEQ_LEN = 13
+        MAX_SEQ_LEN = 5
 
+        random.seed(1)
         # the linear_chain_crf operator only supports sequence (LoD level = 1)
         lod = [[0]]
         for i in range(SEQ_NUM):
-- 
GitLab
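
Appendix (not part of the patch): a minimal NumPy sketch of the quantity the
forward kernel computes, under the operator's transition layout (row 0 holds
the start weights, row 1 the end weights, and the remaining rows the
tag-to-tag weights). The function name and arguments are illustrative, and,
unlike the kernel, the sketch works in raw exp space rather than rescaling
alpha with NormalizeL1 for numerical stability.

import numpy as np

def crf_neg_log_likelihood(emission, transition, labels):
    # emission:   [seq_len, tag_num] unnormalized emission scores x.
    # transition: [tag_num + 2, tag_num]; row 0 = start weights a,
    #             row 1 = end weights b, rows 2.. = tag-to-tag weights w.
    # labels:     [seq_len] gold tag indices.
    seq_len, tag_num = emission.shape
    a, b, w = transition[0], transition[1], transition[2:]

    # Denominator log(Z) via the forward recursion over alpha.
    alpha = np.exp(a + emission[0])
    for k in range(1, seq_len):
        alpha = np.exp(emission[k]) * (alpha @ np.exp(w))
    log_z = np.log(np.sum(alpha * np.exp(b)))

    # Numerator: the score of the gold label path.
    score = a[labels[0]] + emission[0, labels[0]] + b[labels[-1]]
    for k in range(1, seq_len):
        score += emission[k, labels[k]] + w[labels[k - 1], labels[k]]
    return log_z - score

For example, with TAG_NUM = 17 as in the test above, emission has shape
(seq_len, 17), transition has shape (19, 17), and the returned value is a
positive scalar, since Z sums the exponentiated scores of all label paths,
including the gold one.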