Commit 51c45854 authored by liuyuan, committed by GitHub

Merge pull request #678 from pengli09/fix-crf-weight-and-coeff-bug

Fix bug in processing instance weight and coeff in CRFLayer
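The bug in one formula: with a per-sequence instance weight \f$v_i\f$ (read from the optional weight input) and the layer coefficient \f$c\f$ (coeff_), every gradient the layer produces should be scaled by \f$v_i c\f$. Before this patch, LinearChainCRF::backward() accumulated the parameter gradient internally, before any scaling could be applied, so \f$v_i\f$ and \f$c\f$ only ever reached the input gradient. After it, the layer owns both accumulations:

\f[ \frac{\partial L}{\partial x} \mathrel{+}= v_i \, c \, \frac{\partial \ell_i}{\partial x}, \qquad \frac{\partial L}{\partial \theta} \mathrel{+}= v_i \, c \, \frac{\partial \ell_i}{\partial \theta}, \qquad \theta = (a, b, w), \f]

where \f$\ell_i\f$ is the negative log likelihood of the \f$i\f$-th sequence.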
@@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
     return false;
   }
   crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr));
+      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
   return true;
 }
......
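CRFDecodingLayer only runs Viterbi decoding, so it never needs gradient buffers; with gradient storage now allocated inside LinearChainCRF on demand, the trailing nullptr argument goes away.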
@@ -42,6 +42,7 @@ bool CRFLayer::init(const LayerMap& layerMap,
   CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
   parameter_ = parameters_[0];
+  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));
   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
@@ -69,11 +70,7 @@ void CRFLayer::forward(PassType passType) {
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_,
-                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
-                         parameter_->getBuf(PARAMETER_GRADIENT)
-                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
-                             : nullptr);
+      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
     }
     output_.value->getData()[i] =
         crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
@@ -93,22 +90,25 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   const int* starts = label.sequenceStartPositions->getData(false);
   int numSequences = label.sequenceStartPositions->getSize() - 1;
+  bool needWGrad = weight_->getWGrad() ? true : false;
   for (int i = 0; i < numSequences; ++i) {
     crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      output.grad->getData() + numClasses_ * starts[i],
                       label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i]);
-    if (weightLayer_) {
-      real weight = getInputValue(*weightLayer_)->getElement(i, 0);
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->mulScalar(weight);
+                      starts[i + 1] - starts[i],
+                      needWGrad);
+    real instanceWeight = weightLayer_
+                              ? getInputValue(*weightLayer_)->getElement(i, 0)
+                              : real(1.0f);
+    instanceWeight *= coeff_;
+    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (needWGrad) {
+      weight_->getWGrad()->add(
+          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
     }
   }
-  if (coeff_ != real(1.0f)) {
-    output.grad->mulScalar(coeff_);
-  }
   parameter_->incUpdate(callback);
 }
......
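A note on the accumulation above, assuming Paddle's Matrix::add(b, p1, p2) computes this = p1 * this + p2 * b (which is how the patched lines read): grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight) adds the sequence's input gradient scaled by instanceWeight, which already carries coeff_, and the parameter gradient is accumulated the same way. A minimal standalone sketch of that rule on raw buffers (the helper name is ours, not Paddle's):

#include <cstddef>
#include <vector>

// Accumulate one sequence's gradient into the running gradient, scaled by
// the per-sequence instance weight times the layer coefficient:
// dst[k] += (instanceWeight * coeff) * src[k].
void accumulateScaled(std::vector<float>& dst, const std::vector<float>& src,
                      float instanceWeight, float coeff) {
  const float scale = instanceWeight * coeff;
  for (std::size_t k = 0; k < dst.size(); ++k) {
    dst[k] += scale * src[k];
  }
}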
@@ -38,8 +38,9 @@ protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;  // weight for each sequence
-  real coeff_;  // weight for the layer
+  LayerPtr weightLayer_;  // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
+  real coeff_;  // weight for the layer
 };
 }  // namespace paddle
@@ -17,18 +17,12 @@ limitations under the License. */
 namespace paddle {
-LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad)
+LinearChainCRF::LinearChainCRF(int numClasses, real* para)
     : numClasses_(numClasses) {
   a_ = Matrix::create(para, 1, numClasses_);
   b_ = Matrix::create(para + numClasses_, 1, numClasses_);
   w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
-  if (grad) {
-    da_ = Matrix::create(grad, 1, numClasses_);
-    db_ = Matrix::create(grad + numClasses_, 1, numClasses_);
-    dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_);
-  }
   ones_ = Matrix::create(1, numClasses_);
   ones_->one();
@@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   return -ll;
 }
-void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
+void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
   MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  MatrixPtr matDX = Matrix::create(dx, length, numClasses_);
-  MatrixPtr matGrad = Matrix::create(length, numClasses_);
+  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
-  real* dw = dw_ ? dw_->getData() : nullptr;
+  if (needWGrad) {
+    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
+    matWGrad_->zeroMem();
+    da_ = matWGrad_->subRowMatrix(0, 1);
+    db_ = matWGrad_->subRowMatrix(1, 2);
+    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
+  }
   real* alpha = alpha_->getData();
   real* beta = beta_->getData();
   real* expW = expW_->getData();
   real* expX = expX_->getData();
-  real* grad = matGrad->getData();
+  real* grad = matGrad_->getData();
   for (int i = 0; i < numClasses_; ++i) {
     beta[(length - 1) * numClasses_ + i] = exp(b[i]);
@@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
     normalizeL1(beta + k * numClasses_, numClasses_);
   }
-  matGrad->dotMul(*alpha_, *beta_);
-  matGrad->rowNormalizeL1(*matGrad);
+  matGrad_->dotMul(*alpha_, *beta_);
+  matGrad_->rowNormalizeL1(*matGrad_);
   for (int k = 0; k < length; ++k) {
     grad[k * numClasses_ + s[k]] -= (real)1;
   }
-  matDX->add(*matGrad);
-  if (da_) {
-    da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-  }
-  if (db_) {
-    db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1));
-  }
-  beta_->dotMul(*beta_, *expX_);
-  beta_->rowNormalizeL1(*beta_);
-  for (int k = 1; dw && k < length; ++k) {
-    real sum = 0;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-               beta[k * numClasses_ + j];
-      }
-    }
-    sum = 1 / sum;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                   alpha[(k - 1) * numClasses_ + i] *
-                                   beta[k * numClasses_ + j];
-      }
-    }
-    dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
-  }
+  if (needWGrad) {
+    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
+    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
+    beta_->dotMul(*beta_, *expX_);
+    beta_->rowNormalizeL1(*beta_);
+    real* dw = dw_->getData();
+    for (int k = 1; k < length; ++k) {
+      real sum = 0;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
+                 beta[k * numClasses_ + j];
+        }
+      }
+      sum = 1 / sum;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
+                                     alpha[(k - 1) * numClasses_ + i] *
+                                     beta[k * numClasses_ + j];
+        }
+      }
+      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
+    }
+  }
 }
......
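In formulas, what backward() computes: the rows of matGrad_ are the node marginals, obtained by row-normalizing the elementwise product of the forward variables \f$\alpha\f$ and backward variables \f$\beta\f$, and the input gradient is marginal minus indicator,

\f[ \frac{\partial(-\log P(s \mid x))}{\partial x_{k,i}} = P(y_k = i \mid x) - \mathbf{1}[s_k = i] \f]

(the dotMul / rowNormalizeL1 pair followed by grad[k * numClasses_ + s[k]] -= 1). The same rows at k = 0 and k = length - 1 are exactly the gradients for \f$a\f$ and \f$b\f$. The transition gradient is likewise edge marginal minus indicator,

\f[ \frac{\partial(-\log P(s \mid x))}{\partial w_{ij}} = \sum_{k} \Big( P(y_{k-1} = i,\, y_k = j \mid x) - \mathbf{1}[s_{k-1} = i,\, s_k = j] \Big), \f]

where the edge marginal is \f$\exp(w_{ij}) \, \alpha_{k-1,i} \, \beta_{k,j}\f$ divided by the per-position normalizer sum, with the emission factor folded into \f$\beta\f$ beforehand by beta_->dotMul(*beta_, *expX_).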
@@ -21,7 +21,7 @@ namespace paddle {
 class LinearChainCRF {
 public:
   /**
-   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
    * The next numClasses values of para are for ending weights (\f$b\f$),
    * The remaining values are for transition weights (\f$w\f$).
@@ -34,7 +34,7 @@ public:
    * all possible
    * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
    */
-  LinearChainCRF(int numClasses, real* para, real* grad);
+  LinearChainCRF(int numClasses, real* para);
   /**
    * Calculate the negative log likelihood of s given x.
@@ -45,29 +45,45 @@ public:
   /**
    * Calculate the gradient with respect to x, a, b, and w.
-   * The gradient of x will be stored in dx.
    * backward() can only be called after a corresponding call to forward() with
    * the same x, s and length.
-   * @note The gradient is added to dx and grad (provided at constructor).
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
    */
-  void backward(real* x, real* dx, int* s, int length);
+  void backward(real* x, int* s, int length, bool needWGrad);
   /**
    * Find the most probable sequence given x. The result will be stored in s.
    */
   void decode(real* x, int* s, int length);
+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called
+   * after a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
 protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
   MatrixPtr w_;
+  MatrixPtr matWGrad_;
   MatrixPtr da_;
   MatrixPtr db_;
   MatrixPtr dw_;
   MatrixPtr ones_;
   MatrixPtr expX_;
+  MatrixPtr matGrad_;
   MatrixPtr alpha_;
   MatrixPtr beta_;
   MatrixPtr maxX_;
......
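Read together, the new interface implies the following call pattern (a sketch under the header's assumptions, not code from the patch; para points at (numClasses + 2) * numClasses values laid out as a, b, then w, and x is a length-by-numClasses row-major feature buffer):

  LinearChainCRF crf(numClasses, para);
  real nll = crf.forward(x, labels, length);  // negative log likelihood of labels
  crf.backward(x, labels, length, /* needWGrad= */ true);
  MatrixPtr xGrad = crf.getXGrad();  // d(-ll)/dx; overwritten by the next backward()
  MatrixPtr wGrad = crf.getWGrad();  // d(-ll)/d(a, b, w); zeroed on each backward()

Because LinearChainCRF no longer writes into caller-owned gradient buffers, the caller decides how to scale and accumulate these results, which is what lets CRFLayer apply the instance weight and coeff_ uniformly.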
@@ -18,6 +18,14 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
+################ test_CRFLayerGrad ####################
+add_unittest_without_exec(test_CRFLayerGrad
+    test_CRFLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_CRFLayerGrad
+    COMMAND test_CRFLayerGrad)
+
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
     LayerGradUtil.cpp)
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "ModelConfig.pb.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/gserver/layers/LinearChainCRF.h"
#include "paddle/trainer/Trainer.h"
#include "LayerGradUtil.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle; // NOLINT
DECLARE_int32(gpu_id);
DECLARE_bool(thread_local_rand_use_global_seed);
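// Advances seq as a little-endian base-numClasses counter; returns false once
// it wraps around, so a do/while loop over it enumerates all
// numClasses^length possible label sequences exactly once.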
static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
for (auto& v : seq) {
if (++v < numClasses) {
return true;
}
v = 0;
}
return false;
}
// Computes log(exp(x) + exp(y)) stably by shifting both terms by max(x, y).
static inline real logSum(real x, real y) {
real maxValue = std::max(x, y);
if (std::isinf(maxValue)) {
return -std::numeric_limits<real>::infinity();
} else {
return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
}
}
static inline std::vector<int> genRandLabels(int numClasses, int length) {
std::vector<int> labels(length);
for (int i = 0; i < length; ++i) {
labels[i] = rand() % numClasses; // NOLINT
}
return labels;
}
TEST(CRFLayer, cost) {
const int numClasses = 4;
CpuVector para(numClasses * (numClasses + 2));
real* a = para.getData();
real* b = para.getData() + numClasses;
real* w = para.getData() + 2 * numClasses;
LinearChainCRF crf(4, para.getData());
for (int length : {1, 2, 3, 10}) {
for (int tries = 0; tries < 10; ++tries) {
CpuMatrix x(length, numClasses);
x.randomizeUniform();
para.randnorm(0, 2);
std::vector<int> goldenLabels = genRandLabels(numClasses, length);
real cost = crf.forward(x.getData(), goldenLabels.data(), length);
real logZ = -std::numeric_limits<real>::infinity();
real logNominator = -std::numeric_limits<real>::infinity();
std::vector<int> testResult(length, 0);
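// Brute-force reference: enumerate every label sequence s', score it as
// a[s'_0] + sum_k x(k, s'_k) + sum_{k>0} w[s'_{k-1}][s'_k] + b[s'_last],
// fold all scores into logZ with logSum(), and remember the golden
// sequence's score (logNominator); trueCost = logZ - logNominator.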
do {
real score = a[testResult.front()];
score += x.getElement(0, testResult.front());
for (int k = 1; k < length; ++k) {
score += x.getElement(k, testResult[k]) +
w[numClasses * testResult[k - 1] + testResult[k]];
}
score += b[testResult.back()];
logZ = logSum(logZ, score);
if (goldenLabels == testResult) {
logNominator = score;
}
} while (getNextSequence(testResult, numClasses));
real trueCost = -logNominator + logZ;
real diff = fabs(trueCost - cost);
diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
<< std::endl;
if (typeid(real) == typeid(double)) { // NOLINT
EXPECT_LE(diff, 1e-10);
} else {
EXPECT_LE(diff, 5e-3);
}
}
}
}
inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
TestConfig initTestConfig(size_t numClasses, bool withWeight) {
TestConfig config;
config.layerConfig.set_type("crf");
config.layerConfig.set_size(numClasses);
config.biasSize = 0;
config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
"layer_0",
numClasses,
numClasses * (numClasses + 2)});
config.layerConfig.add_inputs();
config.inputDefs.push_back(
{INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
config.layerConfig.add_inputs();
if (withWeight) {
config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
config.layerConfig.add_inputs();
}
return config;
}
TEST(Layer, CRFLayer) {
size_t numClasses = 10;
for (int tries = 0; tries < 5; ++tries) {
TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
for (int length : {1, 3, 100}) {
// GPU is not supported yet
testLayerGrad(config,
"crf",
length,
/* trans= */ false,
/* useGpu= */ false,
/* useWeight= */ false,
epsilon());
}
}
}
TEST(Layer, CRFLayerUseWeight) {
size_t numClasses = 10;
for (int tries = 0; tries < 5; ++tries) {
TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
for (int length : {1, 3, 100}) {
// GPU is not supported yet
testLayerGrad(config,
"crf",
length,
/* trans= */ false,
/* useGpu= */ false,
/* useWeight= */ false,
epsilon());
}
}
}
int main(int argc, char** argv) {
initMain(argc, argv);
hl_start();
hl_init(FLAGS_gpu_id);
FLAGS_thread_local_rand_use_global_seed = true;
srand(1);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
@@ -276,27 +276,6 @@ TEST(Layer, AddtoLayer) {
   }
 }
-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
 TEST(Layer, CTCLayer) {
   TestConfig config;
   config.layerConfig.set_type("ctc");
......
@@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) {
   real* a = para.getData();
   real* b = para.getData() + numClasses;
   real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData(), nullptr);
+  LinearChainCRF crf(4, para.getData());
   for (int length : {1, 2, 3, 10}) {
     for (int tries = 0; tries < 10; ++tries) {
       CpuMatrix x(length, numClasses);
......
@@ -2998,7 +2998,7 @@ class CRFLayer(LayerBase):
         super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
         config_assert(2 <= len(self.inputs) <= 3,
                       'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
         self.config.coeff = coeff
@@ -3020,7 +3020,7 @@ class CRFDecodingLayer(LayerBase):
         config_assert(
             len(self.inputs) <= 2,
             'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
 @config_layer('ctc')
......
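The shape fix matches the C++ side: the parameter is now viewed as a (size + 2) x size matrix (Weight(numClasses_ + 2, numClasses_, parameter_)), whose row 0 holds a, row 1 holds b, and rows 2 through size + 1 hold the transition matrix w; this is the same row split that matWGrad_->subRowMatrix() uses in LinearChainCRF::backward().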
@@ -239,9 +239,9 @@ parameters {
   name: "___crf_layer_0__.w0"
   size: 24
   initial_mean: 0.0
-  initial_std: 0.5
-  dims: 4
+  initial_std: 0.408248290464
   dims: 6
+  dims: 4
   initial_strategy: 0
   initial_smart: true
 }
......
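The new initial_std follows from the corrected shape: smart initialization here appears to use 1/sqrt(dims[0]), so 1/sqrt(4) = 0.5 becomes 1/sqrt(6) ≈ 0.408248290464 once the first dimension changes from 4 to 6.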