From e9c61b67e25f37324131bd8f54afc0b45d992c6d Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng17@baidu.com>
Date: Wed, 30 Nov 2016 21:55:29 +0800
Subject: [PATCH] Fix bug in processing instance weight and coeff in CRFLayer
 The instance weight and coeff are only mulipled to the gradient with respect
 to input.

---
 paddle/gserver/layers/CRFDecodingLayer.cpp   |   2 +-
 paddle/gserver/layers/CRFLayer.cpp           |  29 ++-
 paddle/gserver/layers/CRFLayer.h             |   1 +
 paddle/gserver/layers/LinearChainCRF.cpp     |  72 ++++----
 paddle/gserver/layers/LinearChainCRF.h       |  26 ++-
 paddle/gserver/tests/CMakeLists.txt          |   9 +
 paddle/gserver/tests/test_CRFLayerGrad.cpp   | 178 +++++++++++++++++++
 paddle/gserver/tests/test_LayerGrad.cpp      |  21 ---
 paddle/gserver/tests/test_LinearChainCRF.cpp |   2 +-
 9 files changed, 260 insertions(+), 80 deletions(-)
 create mode 100644 paddle/gserver/tests/test_CRFLayerGrad.cpp

diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp
index 8986741dc30..3c66f4191a8 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.cpp
+++ b/paddle/gserver/layers/CRFDecodingLayer.cpp
@@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
     return false;
   }
   crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr));
+      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
   return true;
 }
 
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index ed4f864ba91..fa434cda002 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -42,6 +42,8 @@ bool CRFLayer::init(const LayerMap& layerMap,
   CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
 
   parameter_ = parameters_[0];
+  weight_.reset(
+      new Weight(numClasses_ + 2, numClasses_, parameter_));
 
   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
@@ -69,11 +71,7 @@ void CRFLayer::forward(PassType passType) {
 
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_,
-                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
-                         parameter_->getBuf(PARAMETER_GRADIENT)
-                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
-                             : nullptr);
+      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
     }
     output_.value->getData()[i] =
         crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
@@ -94,21 +92,22 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   int numSequences = label.sequenceStartPositions->getSize() - 1;
 
   for (int i = 0; i < numSequences; ++i) {
+    bool needWGrad = weight_->getWGrad() ? true : false;
     crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      output.grad->getData() + numClasses_ * starts[i],
                       label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i]);
-    if (weightLayer_) {
-      real weight = getInputValue(*weightLayer_)->getElement(i, 0);
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->mulScalar(weight);
+                      starts[i + 1] - starts[i], needWGrad);
+    real instanceWeight = weightLayer_ ?
+      getInputValue(*weightLayer_)->getElement(i, 0) : real(1.0f);
+    instanceWeight *= coeff_;
+
+    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (needWGrad) {
+      weight_->getWGrad()->add(*crfs_[i].getWGrad(), real(1.0f),
+          instanceWeight);
     }
   }
 
-  if (coeff_ != real(1.0f)) {
-    output.grad->mulScalar(coeff_);
-  }
-
   parameter_->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index 21c7fc61e16..ae024b8fa38 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -38,6 +38,7 @@ protected:
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
   LayerPtr weightLayer_;  // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
   real coeff_;            // weight for the layer
 };
 
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index 2b3a50b2e29..f9e4bb83d43 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -17,18 +17,12 @@ limitations under the License. */
 
 namespace paddle {
 
-LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad)
+LinearChainCRF::LinearChainCRF(int numClasses, real* para)
     : numClasses_(numClasses) {
   a_ = Matrix::create(para, 1, numClasses_);
   b_ = Matrix::create(para + numClasses_, 1, numClasses_);
   w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
 
-  if (grad) {
-    da_ = Matrix::create(grad, 1, numClasses_);
-    db_ = Matrix::create(grad + numClasses_, 1, numClasses_);
-    dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_);
-  }
-
   ones_ = Matrix::create(1, numClasses_);
   ones_->one();
 
@@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   return -ll;
 }
 
-void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
+void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
   MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  MatrixPtr matDX = Matrix::create(dx, length, numClasses_);
-  MatrixPtr matGrad = Matrix::create(length, numClasses_);
+  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
-  real* dw = dw_ ? dw_->getData() : nullptr;
+  if (needWGrad) {
+    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
+    matWGrad_->zeroMem();
+    da_ = matWGrad_->subRowMatrix(0, 1);
+    db_ = matWGrad_->subRowMatrix(1, 2);
+    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
+  }
 
   real* alpha = alpha_->getData();
   real* beta = beta_->getData();
   real* expW = expW_->getData();
   real* expX = expX_->getData();
-  real* grad = matGrad->getData();
+  real* grad = matGrad_->getData();
 
   for (int i = 0; i < numClasses_; ++i) {
     beta[(length - 1) * numClasses_ + i] = exp(b[i]);
@@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
     normalizeL1(beta + k * numClasses_, numClasses_);
   }
 
-  matGrad->dotMul(*alpha_, *beta_);
-  matGrad->rowNormalizeL1(*matGrad);
+  matGrad_->dotMul(*alpha_, *beta_);
+  matGrad_->rowNormalizeL1(*matGrad_);
   for (int k = 0; k < length; ++k) {
     grad[k * numClasses_ + s[k]] -= (real)1;
   }
-  matDX->add(*matGrad);
-  if (da_) {
-    da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-  }
-  if (db_) {
-    db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1));
-  }
 
-  beta_->dotMul(*beta_, *expX_);
-  beta_->rowNormalizeL1(*beta_);
+  if (needWGrad) {
+    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
+    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
 
-  for (int k = 1; dw && k < length; ++k) {
-    real sum = 0;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-               beta[k * numClasses_ + j];
+    beta_->dotMul(*beta_, *expX_);
+    beta_->rowNormalizeL1(*beta_);
+
+    real* dw = dw_->getData();
+    for (int k = 1; k < length; ++k) {
+      real sum = 0;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
+                 beta[k * numClasses_ + j];
+        }
       }
-    }
-    sum = 1 / sum;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                   alpha[(k - 1) * numClasses_ + i] *
-                                   beta[k * numClasses_ + j];
+      sum = 1 / sum;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
+                                     alpha[(k - 1) * numClasses_ + i] *
+                                     beta[k * numClasses_ + j];
+        }
       }
+      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
     }
-    dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
   }
 }
 
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index 6368f2b9de2..f58ac6b5813 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -21,7 +21,7 @@ namespace paddle {
 class LinearChainCRF {
 public:
   /**
-   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
    * The next numClasses values of para are for ending weights (\f$b\f$),
    * The remaning values are for transition weights (\f$w\f$).
@@ -34,7 +34,7 @@ public:
    * all possible
    * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
    */
-  LinearChainCRF(int numClasses, real* para, real* grad);
+  LinearChainCRF(int numClasses, real* para);
 
   /**
    * Calculate the negative log likelihood of s given x.
@@ -45,29 +45,45 @@ public:
 
   /**
    * Calculate the gradient with respect to x, a, b, and w.
-   * The gradient of x will be stored in dx.
    * backward() can only be called after a corresponding call to forward() with
    * the same x, s and length.
-   * @note The gradient is added to dx and grad (provided at constructor).
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
    */
-  void backward(real* x, real* dx, int* s, int length);
+  void backward(real* x, int* s, int length, bool needWGrad);
 
   /**
    * Find the most probable sequence given x. The result will be stored in s.
    */
   void decode(real* x, int* s, int length);
 
+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called after
+   * a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
 protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
   MatrixPtr w_;
+  MatrixPtr matWGrad_;
   MatrixPtr da_;
   MatrixPtr db_;
   MatrixPtr dw_;
   MatrixPtr ones_;
 
   MatrixPtr expX_;
+  MatrixPtr matGrad_;
   MatrixPtr alpha_;
   MatrixPtr beta_;
   MatrixPtr maxX_;
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 0651d0b4733..1525960ffd4 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -20,6 +20,15 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
+################ test_CRFLayerGrad ####################
+add_unittest_without_exec(test_CRFLayerGrad
+    test_CRFLayerGrad.cpp
+    LayerGradUtil.cpp
+    TestUtil.cpp)
+add_test(NAME test_CRFLayerGrad
+    COMMAND test_CRFLayerGrad)
+
+
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
     LayerGradUtil.cpp
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
new file mode 100644
index 00000000000..bc1d5f3061d
--- /dev/null
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/gserver/layers/LinearChainCRF.h"
+#include "ModelConfig.pb.h"
+
+#include "TestUtil.h"
+#include "LayerGradUtil.h"
+
+using namespace paddle; // NOLINT
+
+P_DECLARE_bool(use_gpu);
+P_DECLARE_int32(gpu_id);
+P_DECLARE_double(checkgrad_eps);
+P_DECLARE_bool(thread_local_rand_use_global_seed);
+P_DECLARE_bool(prev_batch_state);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+// log(exp(x) + exp(y))
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return -std::numeric_limits<real>::infinity();
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);
+  for (int i = 0; i < length; ++i) {
+    labels[i] = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;
+
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost
+        << " diff=" << diff << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
+
+inline real epsilon() {
+  return typeid(real) == typeid(double) ? 1e-10 : 0.05;
+}
+
+TestConfig initTestConfig(size_t numClasses, bool withWeight) {
+  TestConfig config;
+  config.layerConfig.set_type("crf");
+  config.layerConfig.set_size(numClasses);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0",
+      numClasses, numClasses * (numClasses + 2)});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_label",
+      numClasses, 0});
+  config.layerConfig.add_inputs();
+
+  if (withWeight) {
+    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight",
+        1, 0});
+    config.layerConfig.add_inputs();
+  }
+
+  return config;
+}
+
+TEST(Layer, CRFLayer) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+TEST(Layer, CRFLayerUseWeight) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 374ae57dd36..2aa86cc65c7 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -270,27 +270,6 @@ TEST(Layer, AddtoLayer) {
   }
 }
 
-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
 TEST(Layer, CTCLayer) {
   TestConfig config;
   config.layerConfig.set_type("ctc");
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index 913d6ed7511..82361fa5511 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) {
   real* a = para.getData();
   real* b = para.getData() + numClasses;
   real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData(), nullptr);
+  LinearChainCRF crf(4, para.getData());
   for (int length : {1, 2, 3, 10}) {
     for (int tries = 0; tries < 10; ++tries) {
       CpuMatrix x(length, numClasses);
-- 
GitLab