diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp
index fdb46aba68e924480a6595b02c04ff4d1edd914d..191176ce985a8e12e33562f0cab73da6bbe667e6 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.cpp
+++ b/paddle/gserver/layers/CRFDecodingLayer.cpp
@@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
     return false;
   }
   crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr));
+      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
   return true;
 }
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index 02b7aaf17e89d889ca0030f9de2b5d7431a28fd3..0b544420097e9150f8489731b6379dea633e992c 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -42,6 +42,7 @@ bool CRFLayer::init(const LayerMap& layerMap,
   CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));

   parameter_ = parameters_[0];
+  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));

   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
@@ -69,11 +70,7 @@ void CRFLayer::forward(PassType passType) {

   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_,
-                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
-                         parameter_->getBuf(PARAMETER_GRADIENT)
-                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
-                             : nullptr);
+      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
     }
     output_.value->getData()[i] =
         crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
@@ -93,22 +90,25 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   const int* starts = label.sequenceStartPositions->getData(false);
   int numSequences = label.sequenceStartPositions->getSize() - 1;

+  bool needWGrad = weight_->getWGrad() ? true : false;
   for (int i = 0; i < numSequences; ++i) {
     crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      output.grad->getData() + numClasses_ * starts[i],
                       label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i]);
-    if (weightLayer_) {
-      real weight = getInputValue(*weightLayer_)->getElement(i, 0);
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->mulScalar(weight);
+                      starts[i + 1] - starts[i],
+                      needWGrad);
+    real instanceWeight = weightLayer_
+                              ? getInputValue(*weightLayer_)->getElement(i, 0)
+                              : real(1.0f);
+    instanceWeight *= coeff_;
+
+    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (needWGrad) {
+      weight_->getWGrad()->add(
+          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
     }
   }

-  if (coeff_ != real(1.0f)) {
-    output.grad->mulScalar(coeff_);
-  }
-
   parameter_->incUpdate(callback);
 }
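Note on the rewritten backward() above: with a per-sequence weight w_i read from weightLayer_ and the layer coefficient coeff_, the effective loss is sum_i coeff_ * w_i * NLL_i, so both the input gradient and the parameter gradient of sequence i are scaled by the single factor instanceWeight = coeff_ * w_i, replacing the old post-hoc mulScalar() calls. Assuming Matrix::add(b, p1, p2) computes this = p1 * this + p2 * b, as its use here suggests, the accumulation is equivalent to this sketch over raw buffers:

    // Minimal sketch of the weighted accumulation; plain arrays stand in for
    // paddle::Matrix, and instanceWeight = coeff_ * w_i as described above.
    void accumulateScaled(real* dst, const real* src, size_t n,
                          real instanceWeight) {
      for (size_t k = 0; k < n; ++k) {
        dst[k] += instanceWeight * src[k];  // dst = 1.0 * dst + instanceWeight * src
      }
    }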
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index de36a85083b6b293fd2d8522ec279a38cc4f8be3..00ec13cede97401b4c8a308df6fac27e47692146 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -38,8 +38,9 @@ protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;  // weight for each sequence
-  real coeff_;            // weight for the layer
+  LayerPtr weightLayer_;            // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
+  real coeff_;                      // weight for the layer
 };

 }  // namespace paddle
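For reference, the new Weight member wraps the same flat parameter block that LinearChainCRF slices in its constructor (next file); a minimal sketch of that (numClasses + 2) x numClasses row-major layout, with offsets taken from this diff:

    // Slicing the flat (numClasses + 2) * numClasses parameter block exactly
    // as LinearChainCRF's constructor does; `real` is Paddle's float type.
    void sliceCRFParameter(real* para, int numClasses,
                           real*& a, real*& b, real*& w) {
      a = para;                   // row 0: starting weights, 1 x numClasses
      b = para + numClasses;      // row 1: ending weights, 1 x numClasses
      w = para + 2 * numClasses;  // rows 2..numClasses+1: transition weights
    }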
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index b7f748f3bb8a419429956724131e81dfdbd274c6..dc3dc156792bdf32c3b948a292597d0e9eca5d8b 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -17,18 +17,12 @@ limitations under the License. */

 namespace paddle {

-LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad)
+LinearChainCRF::LinearChainCRF(int numClasses, real* para)
     : numClasses_(numClasses) {
   a_ = Matrix::create(para, 1, numClasses_);
   b_ = Matrix::create(para + numClasses_, 1, numClasses_);
   w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);

-  if (grad) {
-    da_ = Matrix::create(grad, 1, numClasses_);
-    db_ = Matrix::create(grad + numClasses_, 1, numClasses_);
-    dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_);
-  }
-
   ones_ = Matrix::create(1, numClasses_);
   ones_->one();

@@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   return -ll;
 }

-void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
+void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
   MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  MatrixPtr matDX = Matrix::create(dx, length, numClasses_);
-  MatrixPtr matGrad = Matrix::create(length, numClasses_);
+  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
-  real* dw = dw_ ? dw_->getData() : nullptr;
+  if (needWGrad) {
+    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
+    matWGrad_->zeroMem();
+    da_ = matWGrad_->subRowMatrix(0, 1);
+    db_ = matWGrad_->subRowMatrix(1, 2);
+    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
+  }

   real* alpha = alpha_->getData();
   real* beta = beta_->getData();
   real* expW = expW_->getData();
   real* expX = expX_->getData();
-  real* grad = matGrad->getData();
+  real* grad = matGrad_->getData();

   for (int i = 0; i < numClasses_; ++i) {
     beta[(length - 1) * numClasses_ + i] = exp(b[i]);
@@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
     normalizeL1(beta + k * numClasses_, numClasses_);
   }

-  matGrad->dotMul(*alpha_, *beta_);
-  matGrad->rowNormalizeL1(*matGrad);
+  matGrad_->dotMul(*alpha_, *beta_);
+  matGrad_->rowNormalizeL1(*matGrad_);
   for (int k = 0; k < length; ++k) {
     grad[k * numClasses_ + s[k]] -= (real)1;
   }
-  matDX->add(*matGrad);
-  if (da_) {
-    da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-  }
-  if (db_) {
-    db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1));
-  }

-  beta_->dotMul(*beta_, *expX_);
-  beta_->rowNormalizeL1(*beta_);
+  if (needWGrad) {
+    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
+    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));

-  for (int k = 1; dw && k < length; ++k) {
-    real sum = 0;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-               beta[k * numClasses_ + j];
+    beta_->dotMul(*beta_, *expX_);
+    beta_->rowNormalizeL1(*beta_);
+
+    real* dw = dw_->getData();
+    for (int k = 1; k < length; ++k) {
+      real sum = 0;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
+                 beta[k * numClasses_ + j];
+        }
       }
-    }
-    sum = 1 / sum;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                   alpha[(k - 1) * numClasses_ + i] *
-                                   beta[k * numClasses_ + j];
+      sum = 1 / sum;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
+                                     alpha[(k - 1) * numClasses_ + i] *
+                                     beta[k * numClasses_ + j];
+        }
       }
+      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
     }
-    dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
   }
 }
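A note on the gradient computed in backward() above: after dotMul(*alpha_, *beta_) and row normalization, matGrad_[k][j] holds the forward-backward marginal P(s_k = j | x), and subtracting 1 at the gold label gives the standard CRF gradient dNLL/dx[k][j] = P(s_k = j | x) - 1{j == s[k]}. That is why getXGrad() can return matGrad_ directly, leaving CRFLayer::backward only the job of scaling and accumulating it.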
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index a905bf803dd5443ef8d4ad7702720a50a5220a9a..8daf1e14a6fa98bef41f4f32bff439df8302adfd 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -21,7 +21,7 @@ namespace paddle {
 class LinearChainCRF {
 public:
   /**
-   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
    * The next numClasses values of para are for ending weights (\f$b\f$).
    * The remaining values are for transition weights (\f$w\f$).
    *
@@ -34,7 +34,7 @@ public:
    * all possible
    * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
    */
-  LinearChainCRF(int numClasses, real* para, real* grad);
+  LinearChainCRF(int numClasses, real* para);

   /**
    * Calculate the negative log likelihood of s given x.
@@ -45,29 +45,45 @@ public:

   /**
    * Calculate the gradient with respect to x, a, b, and w.
-   * The gradient of x will be stored in dx.
    * backward() can only be called after a corresponding call to forward() with
    * the same x, s and length.
-   * @note The gradient is added to dx and grad (provided at constructor).
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
    */
-  void backward(real* x, real* dx, int* s, int length);
+  void backward(real* x, int* s, int length, bool needWGrad);

   /**
    * Find the most probable sequence given x. The result will be stored in s.
    */
   void decode(real* x, int* s, int length);

+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called
+   * after a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
 protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
   MatrixPtr w_;
+  MatrixPtr matWGrad_;
   MatrixPtr da_;
   MatrixPtr db_;
   MatrixPtr dw_;
   MatrixPtr ones_;

   MatrixPtr expX_;
+  MatrixPtr matGrad_;
   MatrixPtr alpha_;
   MatrixPtr beta_;
   MatrixPtr maxX_;
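Putting the new interface together, a minimal usage sketch (types and buffer sizes as documented above; error handling omitted):

    // para: (numClasses + 2) * numClasses reals holding (a, b, w);
    // x: length x numClasses input scores; s: length gold labels.
    LinearChainCRF crf(numClasses, para);
    real nll = crf.forward(x, s, length);  // negative log likelihood of s
    crf.backward(x, s, length, /* needWGrad= */ true);
    MatrixPtr dx = crf.getXGrad();  // length x numClasses, gradient w.r.t. x
    MatrixPtr dW = crf.getWGrad();  // (numClasses + 2) x numClasses, w.r.t. (a, b, w)
    // Scaling and accumulation are now the caller's job, as in CRFLayer::backward.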
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 0caa5e1e11e6d42fadfa87149814c4b77b3b6271..3c4128b5b8a0ea420bd3027b9a36e5f75087c3cb 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -18,6 +18,14 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)

+################ test_CRFLayerGrad ####################
+add_unittest_without_exec(test_CRFLayerGrad
+    test_CRFLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_CRFLayerGrad
+    COMMAND test_CRFLayerGrad)
+
+
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
     LayerGradUtil.cpp)
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df14449291e9ec08f45718de07bbb101f6dbea58
--- /dev/null
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/LinearChainCRF.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+// log(exp(x) + exp(y))
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return -std::numeric_limits<real>::infinity();
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);
+  for (int i = 0; i < length; ++i) {
+    labels[i] = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;
+
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost
+              << " diff=" << diff << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
+
+inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
+
+TestConfig initTestConfig(size_t numClasses, bool withWeight) {
+  TestConfig config;
+  config.layerConfig.set_type("crf");
+  config.layerConfig.set_size(numClasses);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              numClasses,
+                              numClasses * (numClasses + 2)});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
+  config.layerConfig.add_inputs();
+
+  if (withWeight) {
+    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
+    config.layerConfig.add_inputs();
+  }
+
+  return config;
+}
+
+TEST(Layer, CRFLayer) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+TEST(Layer, CRFLayerUseWeight) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 14d9db52470b2828186eca04d303135910489266..ceb69359c992128635c199e56805d3f603ca4271 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -276,27 +276,6 @@ TEST(Layer, AddtoLayer) {
   }
 }

-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
 TEST(Layer, CTCLayer) {
   TestConfig config;
   config.layerConfig.set_type("ctc");
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index f046cb0b289c9ce22b98f3200bf0a3f7d48d77f5..b37277054c58a5f71cc4649fc6c062ca8dc1d4c9 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) {
   real* a = para.getData();
   real* b = para.getData() + numClasses;
   real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData(), nullptr);
+  LinearChainCRF crf(4, para.getData());
   for (int length : {1, 2, 3, 10}) {
     for (int tries = 0; tries < 10; ++tries) {
       CpuMatrix x(length, numClasses);
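Two details of the new test worth noting: TEST(CRFLayer, cost) checks forward() against a brute-force pass that enumerates all numClasses^length labelings to compute the exact log partition function, which is why it keeps numClasses at 4 and length at 10 or less; and logSum() is the usual log-sum-exp guard, log(exp(x) + exp(y)) = m + log(exp(x - m) + exp(y - m)) with m = max(x, y), so accumulating logZ cannot overflow.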
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 4d15210fc3aa47321cae678b067f5cc2c402b00b..e257aa568facb1555944dba7e76c5d8bce7f1c7d 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2998,7 +2998,7 @@ class CRFLayer(LayerBase):
         super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
         config_assert(2 <= len(self.inputs) <= 3,
                       'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
         self.config.coeff = coeff

@@ -3020,7 +3020,7 @@ class CRFDecodingLayer(LayerBase):
         config_assert(
             len(self.inputs) <= 2,
             'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])

 @config_layer('ctc')
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 10e59e21bc7a48bc53fb535f86f053c91f57c1df..05fd1c99d2db6e9faa3b3884ec9baf051791f9fe 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -239,9 +239,9 @@ parameters {
   name: "___crf_layer_0__.w0"
   size: 24
   initial_mean: 0.0
-  initial_std: 0.5
-  dims: 4
+  initial_std: 0.408248290464
   dims: 6
+  dims: 4
   initial_strategy: 0
   initial_smart: true
 }
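Consistency check on the protostr change: under initial_smart the std appears to be 1/sqrt(dims[0]). Before the swap dims[0] = 4 ([size, size + 2] with size = 4), giving 1/sqrt(4) = 0.5; after it dims[0] = 6 ([size + 2, size]), giving 1/sqrt(6) ≈ 0.408248290464, exactly the new value. Only the declared shape changes; the parameter count stays at size: 24.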