From e9c61b67e25f37324131bd8f54afc0b45d992c6d Mon Sep 17 00:00:00 2001 From: Peng Li Date: Wed, 30 Nov 2016 21:55:29 +0800 Subject: [PATCH] Fix bug in processing instance weight and coeff in CRFLayer The instance weight and coeff are only multiplied to the gradient with respect to input. --- paddle/gserver/layers/CRFDecodingLayer.cpp | 2 +- paddle/gserver/layers/CRFLayer.cpp | 29 ++- paddle/gserver/layers/CRFLayer.h | 1 + paddle/gserver/layers/LinearChainCRF.cpp | 72 ++++---- paddle/gserver/layers/LinearChainCRF.h | 26 ++- paddle/gserver/tests/CMakeLists.txt | 9 + paddle/gserver/tests/test_CRFLayerGrad.cpp | 178 +++++++++++++++++++ paddle/gserver/tests/test_LayerGrad.cpp | 21 --- paddle/gserver/tests/test_LinearChainCRF.cpp | 2 +- 9 files changed, 260 insertions(+), 80 deletions(-) create mode 100644 paddle/gserver/tests/test_CRFLayerGrad.cpp diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp index 8986741dc3..3c66f4191a 100644 --- a/paddle/gserver/layers/CRFDecodingLayer.cpp +++ b/paddle/gserver/layers/CRFDecodingLayer.cpp @@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap, return false; } crf_.reset(new LinearChainCRF( - numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr)); + numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData())); return true; } diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index ed4f864ba9..fa434cda00 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -42,6 +42,8 @@ bool CRFLayer::init(const LayerMap& layerMap, CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2)); parameter_ = parameters_[0]; + weight_.reset( + new Weight(numClasses_ + 2, numClasses_, parameter_)); // We don't need sequenceStartPositions because each sample of output_ is // for the cost of one sequence. 
@@ -69,11 +71,7 @@ void CRFLayer::forward(PassType passType) { for (size_t i = 0; i < numSequences; ++i) { if (i >= crfs_.size()) { - crfs_.emplace_back(numClasses_, - parameter_->getBuf(PARAMETER_VALUE)->getData(), - parameter_->getBuf(PARAMETER_GRADIENT) - ? parameter_->getBuf(PARAMETER_GRADIENT)->getData() - : nullptr); + crfs_.emplace_back(numClasses_, weight_->getW()->getData()); } output_.value->getData()[i] = crfs_[i].forward(output.value->getData() + numClasses_ * starts[i], @@ -94,21 +92,22 @@ void CRFLayer::backward(const UpdateCallback& callback) { int numSequences = label.sequenceStartPositions->getSize() - 1; for (int i = 0; i < numSequences; ++i) { + bool needWGrad = weight_->getWGrad() ? true : false; crfs_[i].backward(output.value->getData() + numClasses_ * starts[i], - output.grad->getData() + numClasses_ * starts[i], label.ids->getData() + starts[i], - starts[i + 1] - starts[i]); - if (weightLayer_) { - real weight = getInputValue(*weightLayer_)->getElement(i, 0); - MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]); - grad->mulScalar(weight); + starts[i + 1] - starts[i], needWGrad); + real instanceWeight = weightLayer_ ? 
+ getInputValue(*weightLayer_)->getElement(i, 0) : real(1.0f); + instanceWeight *= coeff_; + + MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]); + grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight); + if (needWGrad) { + weight_->getWGrad()->add(*crfs_[i].getWGrad(), real(1.0f), + instanceWeight); } } - if (coeff_ != real(1.0f)) { - output.grad->mulScalar(coeff_); - } - parameter_->incUpdate(callback); } diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h index 21c7fc61e1..ae024b8fa3 100644 --- a/paddle/gserver/layers/CRFLayer.h +++ b/paddle/gserver/layers/CRFLayer.h @@ -38,6 +38,7 @@ protected: ParameterPtr parameter_; std::vector crfs_; LayerPtr weightLayer_; // weight for each sequence + std::unique_ptr weight_; // parameters real coeff_; // weight for the layer }; diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp index 2b3a50b2e2..f9e4bb83d4 100644 --- a/paddle/gserver/layers/LinearChainCRF.cpp +++ b/paddle/gserver/layers/LinearChainCRF.cpp @@ -17,18 +17,12 @@ limitations under the License. 
*/ namespace paddle { -LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad) +LinearChainCRF::LinearChainCRF(int numClasses, real* para) : numClasses_(numClasses) { a_ = Matrix::create(para, 1, numClasses_); b_ = Matrix::create(para + numClasses_, 1, numClasses_); w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_); - if (grad) { - da_ = Matrix::create(grad, 1, numClasses_); - db_ = Matrix::create(grad + numClasses_, 1, numClasses_); - dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_); - } - ones_ = Matrix::create(1, numClasses_); ones_->one(); @@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) { return -ll; } -void LinearChainCRF::backward(real* x, real* dx, int* s, int length) { +void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) { MatrixPtr matX = Matrix::create(x, length, numClasses_); - MatrixPtr matDX = Matrix::create(dx, length, numClasses_); - MatrixPtr matGrad = Matrix::create(length, numClasses_); + Matrix::resizeOrCreate(matGrad_, length, numClasses_); Matrix::resizeOrCreate(beta_, length, numClasses_); real* b = b_->getData(); - real* dw = dw_ ? 
dw_->getData() : nullptr; + if (needWGrad) { + Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_); + matWGrad_->zeroMem(); + da_ = matWGrad_->subRowMatrix(0, 1); + db_ = matWGrad_->subRowMatrix(1, 2); + dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2); + } real* alpha = alpha_->getData(); real* beta = beta_->getData(); real* expW = expW_->getData(); real* expX = expX_->getData(); - real* grad = matGrad->getData(); + real* grad = matGrad_->getData(); for (int i = 0; i < numClasses_; ++i) { beta[(length - 1) * numClasses_ + i] = exp(b[i]); @@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) { normalizeL1(beta + k * numClasses_, numClasses_); } - matGrad->dotMul(*alpha_, *beta_); - matGrad->rowNormalizeL1(*matGrad); + matGrad_->dotMul(*alpha_, *beta_); + matGrad_->rowNormalizeL1(*matGrad_); for (int k = 0; k < length; ++k) { grad[k * numClasses_ + s[k]] -= (real)1; } - matDX->add(*matGrad); - if (da_) { - da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1)); - } - if (db_) { - db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1)); - } - beta_->dotMul(*beta_, *expX_); - beta_->rowNormalizeL1(*beta_); + if (needWGrad) { + da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1)); + db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1)); - for (int k = 1; dw && k < length; ++k) { - real sum = 0; - for (int i = 0; i < numClasses_; ++i) { - for (int j = 0; j < numClasses_; ++j) { - sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] * - beta[k * numClasses_ + j]; + beta_->dotMul(*beta_, *expX_); + beta_->rowNormalizeL1(*beta_); + + real* dw = dw_->getData(); + for (int k = 1; k < length; ++k) { + real sum = 0; + for (int i = 0; i < numClasses_; ++i) { + for (int j = 0; j < numClasses_; ++j) { + sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] * + beta[k * numClasses_ + j]; + } } - } - sum = 1 / sum; - for (int i = 0; i < numClasses_; ++i) { - for 
(int j = 0; j < numClasses_; ++j) { - dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] * - alpha[(k - 1) * numClasses_ + i] * - beta[k * numClasses_ + j]; + sum = 1 / sum; + for (int i = 0; i < numClasses_; ++i) { + for (int j = 0; j < numClasses_; ++j) { + dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] * + alpha[(k - 1) * numClasses_ + i] * + beta[k * numClasses_ + j]; + } } + dw[s[k - 1] * numClasses_ + s[k]] -= (real)1; } - dw[s[k - 1] * numClasses_ + s[k]] -= (real)1; } } diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h index 6368f2b9de..f58ac6b581 100644 --- a/paddle/gserver/layers/LinearChainCRF.h +++ b/paddle/gserver/layers/LinearChainCRF.h @@ -21,7 +21,7 @@ namespace paddle { class LinearChainCRF { public: /** - * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$. + * The size of para must be \f$(numClasses + 2) * numClasses\f$. * The first numClasses values of para are for starting weights (\f$a\f$). * The next numClasses values of para are for ending weights (\f$b\f$), * The remaning values are for transition weights (\f$w\f$). @@ -34,7 +34,7 @@ public: * all possible * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF. */ - LinearChainCRF(int numClasses, real* para, real* grad); + LinearChainCRF(int numClasses, real* para); /** * Calculate the negative log likelihood of s given x. @@ -45,29 +45,45 @@ public: /** * Calculate the gradient with respect to x, a, b, and w. - * The gradient of x will be stored in dx. * backward() can only be called after a corresponding call to forward() with * the same x, s and length. - * @note The gradient is added to dx and grad (provided at constructor). + * The gradient with respect to a, b, and w will not be calculated if + * needWGrad is false. + * @note Please call getWGrad() and getXGrad() to get the gradient with + * respect to (a, b, w) and x respectively. 
*/ - void backward(real* x, real* dx, int* s, int length); + void backward(real* x, int* s, int length, bool needWGrad); /** * Find the most probable sequence given x. The result will be stored in s. */ void decode(real* x, int* s, int length); + /* + * Return the gradient with respect to (a, b, w). It can only be called after + * a corresponding call to backward(). + */ + MatrixPtr getWGrad() { return matWGrad_; } + + /* + * Return the gradient with respect to x. It can only be called after a + * corresponding call to backward(). + */ + MatrixPtr getXGrad() { return matGrad_; } + protected: int numClasses_; MatrixPtr a_; MatrixPtr b_; MatrixPtr w_; + MatrixPtr matWGrad_; MatrixPtr da_; MatrixPtr db_; MatrixPtr dw_; MatrixPtr ones_; MatrixPtr expX_; + MatrixPtr matGrad_; MatrixPtr alpha_; MatrixPtr beta_; MatrixPtr maxX_; diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 0651d0b473..1525960ffd 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -20,6 +20,15 @@ add_unittest_without_exec(test_LayerGrad add_test(NAME test_LayerGrad COMMAND test_LayerGrad) +################ test_CRFLayerGrad #################### +add_unittest_without_exec(test_CRFLayerGrad + test_CRFLayerGrad.cpp + LayerGradUtil.cpp + TestUtil.cpp) +add_test(NAME test_CRFLayerGrad + COMMAND test_CRFLayerGrad) + + add_unittest_without_exec(test_ActivationGrad test_ActivationGrad.cpp LayerGradUtil.cpp diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp new file mode 100644 index 0000000000..bc1d5f3061 --- /dev/null +++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/trainer/Trainer.h" +#include "paddle/gserver/layers/LinearChainCRF.h" +#include "ModelConfig.pb.h" + +#include "TestUtil.h" +#include "LayerGradUtil.h" + +using namespace paddle; // NOLINT + +P_DECLARE_bool(use_gpu); +P_DECLARE_int32(gpu_id); +P_DECLARE_double(checkgrad_eps); +P_DECLARE_bool(thread_local_rand_use_global_seed); +P_DECLARE_bool(prev_batch_state); + +static inline bool getNextSequence(std::vector& seq, int numClasses) { + for (auto& v : seq) { + if (++v < numClasses) { + return true; + } + v = 0; + } + return false; +} + +// log(exp(x) + exp(y)) +static inline real logSum(real x, real y) { + real maxValue = std::max(x, y); + if (std::isinf(maxValue)) { + return -std::numeric_limits::infinity(); + } else { + return maxValue + log(exp(x - maxValue) + exp(y - maxValue)); + } +} + +static inline std::vector genRandLabels(int numClasses, int length) { + std::vector labels(length); + for (int i = 0; i < length; ++i) { + labels[i] = rand() % numClasses; // NOLINT + } + return labels; +} + +TEST(CRFLayer, cost) { + const int numClasses = 4; + CpuVector para(numClasses * (numClasses + 2)); + real* a = para.getData(); + real* b = para.getData() + numClasses; + real* w = para.getData() + 2 * numClasses; + LinearChainCRF crf(4, para.getData()); + for (int length : {1, 2, 3, 10}) { + for (int tries = 0; tries < 10; ++tries) { + CpuMatrix x(length, numClasses); + x.randomizeUniform(); + para.randnorm(0, 2); + + std::vector goldenLabels = genRandLabels(numClasses, length); + + real cost = 
crf.forward(x.getData(), goldenLabels.data(), length); + + real logZ = -std::numeric_limits::infinity(); + real logNominator = -std::numeric_limits::infinity(); + std::vector testResult(length, 0); + do { + real score = a[testResult.front()]; + score += x.getElement(0, testResult.front()); + for (int k = 1; k < length; ++k) { + score += x.getElement(k, testResult[k]) + + w[numClasses * testResult[k - 1] + testResult[k]]; + } + score += b[testResult.back()]; + logZ = logSum(logZ, score); + + if (goldenLabels == testResult) { + logNominator = score; + } + } while (getNextSequence(testResult, numClasses)); + + real trueCost = -logNominator + logZ; + + real diff = fabs(trueCost - cost); + diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost); + VLOG(1) << "cost=" << cost << " trueCost=" << trueCost + << " diff=" << diff << std::endl; + if (typeid(real) == typeid(double)) { // NOLINT + EXPECT_LE(diff, 1e-10); + } else { + EXPECT_LE(diff, 5e-3); + } + } + } +} + +inline real epsilon() { + return typeid(real) == typeid(double) ? 
1e-10 : 0.05; +} + +TestConfig initTestConfig(size_t numClasses, bool withWeight) { + TestConfig config; + config.layerConfig.set_type("crf"); + config.layerConfig.set_size(numClasses); + config.biasSize = 0; + + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", + numClasses, numClasses * (numClasses + 2)}); + config.layerConfig.add_inputs(); + config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_label", + numClasses, 0}); + config.layerConfig.add_inputs(); + + if (withWeight) { + config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", + 1, 0}); + config.layerConfig.add_inputs(); + } + + return config; +} + +TEST(Layer, CRFLayer) { + size_t numClasses = 10; + for (int tries = 0; tries < 5; ++tries) { + TestConfig config = initTestConfig(numClasses, /* withWeight= */ false); + for (int length : {1, 3, 100}) { + // Not support GPU now + testLayerGrad(config, + "crf", + length, + /* trans= */ false, + /* useGpu= */ false, + /* useWeight= */ false, + epsilon()); + } + } +} + +TEST(Layer, CRFLayerUseWeight) { + size_t numClasses = 10; + for (int tries = 0; tries < 5; ++tries) { + TestConfig config = initTestConfig(numClasses, /* withWeight= */ true); + for (int length : {1, 3, 100}) { + // Not support GPU now + testLayerGrad(config, + "crf", + length, + /* trans= */ false, + /* useGpu= */ false, + /* useWeight= */ false, + epsilon()); + } + } +} + +int main(int argc, char** argv) { + initMain(argc, argv); + hl_start(); + hl_init(FLAGS_gpu_id); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 374ae57dd3..2aa86cc65c 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -270,27 +270,6 @@ TEST(Layer, AddtoLayer) { } } -TEST(Layer, CRFLayer) { - TestConfig config; - config.layerConfig.set_type("crf"); - 
config.layerConfig.set_size(10); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120}); - config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - // Not support GPU now - testLayerGrad(config, - "crf", - 100, - /* trans */ false, - /* useGpu */ false, - false /*useWeight*/, - 0.03 /*epsilon*/); -} - TEST(Layer, CTCLayer) { TestConfig config; config.layerConfig.set_type("ctc"); diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp index 913d6ed751..82361fa551 100644 --- a/paddle/gserver/tests/test_LinearChainCRF.cpp +++ b/paddle/gserver/tests/test_LinearChainCRF.cpp @@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) { real* a = para.getData(); real* b = para.getData() + numClasses; real* w = para.getData() + 2 * numClasses; - LinearChainCRF crf(4, para.getData(), nullptr); + LinearChainCRF crf(4, para.getData()); for (int length : {1, 2, 3, 10}) { for (int tries = 0; tries < 10; ++tries) { CpuMatrix x(length, numClasses); -- GitLab