Commit 51c45854, authored by liuyuan, committed by GitHub

Merge pull request #678 from pengli09/fix-crf-weight-and-coeff-bug

Fix bug in processing instance weight and coeff in CRFLayer
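(Summary of the change, as read from the diff below.) Previously, CRFLayer scaled only the gradient with respect to its input by the per-sequence weight and by coeff_; the gradient with respect to the CRF parameters (a, b, w) was written straight into the parameter gradient buffer inside LinearChainCRF::backward() and therefore never picked up either factor. With this change, LinearChainCRF computes per-sequence gradients into its own buffers, exposed through getXGrad() and getWGrad(), and CRFLayer::backward() accumulates both of them scaled by instanceWeight * coeff_.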
@@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
     return false;
   }
   crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr));
+      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
   return true;
 }
...
@@ -42,6 +42,7 @@ bool CRFLayer::init(const LayerMap& layerMap,
   CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
   parameter_ = parameters_[0];
+  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));

   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
@@ -69,11 +70,7 @@ void CRFLayer::forward(PassType passType) {
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_,
-                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
-                         parameter_->getBuf(PARAMETER_GRADIENT)
-                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
-                             : nullptr);
+      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
     }
     output_.value->getData()[i] =
         crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
@@ -93,22 +90,25 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   const int* starts = label.sequenceStartPositions->getData(false);
   int numSequences = label.sequenceStartPositions->getSize() - 1;
+  bool needWGrad = weight_->getWGrad() ? true : false;
   for (int i = 0; i < numSequences; ++i) {
     crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      output.grad->getData() + numClasses_ * starts[i],
                       label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i]);
-    if (weightLayer_) {
-      real weight = getInputValue(*weightLayer_)->getElement(i, 0);
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->mulScalar(weight);
+                      starts[i + 1] - starts[i],
+                      needWGrad);
+    real instanceWeight = weightLayer_
+                              ? getInputValue(*weightLayer_)->getElement(i, 0)
+                              : real(1.0f);
+    instanceWeight *= coeff_;
+    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (needWGrad) {
+      weight_->getWGrad()->add(
+          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
     }
   }
-  if (coeff_ != real(1.0f)) {
-    output.grad->mulScalar(coeff_);
-  }
   parameter_->incUpdate(callback);
 }
...
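Summarizing the rewritten loop above: with per-sequence weight $w_i$ (1.0 when no weight layer is attached) and layer coefficient $c$ (coeff_), the gradients accumulated into output.grad and weight_->getWGrad() are now

$$\Delta x^{(i)} = c\,w_i\,\frac{\partial\,\mathrm{NLL}_i}{\partial x^{(i)}},\qquad \Delta(a,b,w) = \sum_i c\,w_i\,\frac{\partial\,\mathrm{NLL}_i}{\partial (a,b,w)},$$

where $\mathrm{NLL}_i$ is the negative log likelihood that forward() returned for sequence $i$.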
@@ -38,8 +38,9 @@ protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
   LayerPtr weightLayer_;  // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
   real coeff_;  // weight for the layer
 };

 }  // namespace paddle
@@ -17,18 +17,12 @@ limitations under the License. */

 namespace paddle {

-LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad)
+LinearChainCRF::LinearChainCRF(int numClasses, real* para)
     : numClasses_(numClasses) {
   a_ = Matrix::create(para, 1, numClasses_);
   b_ = Matrix::create(para + numClasses_, 1, numClasses_);
   w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
-  if (grad) {
-    da_ = Matrix::create(grad, 1, numClasses_);
-    db_ = Matrix::create(grad + numClasses_, 1, numClasses_);
-    dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_);
-  }

   ones_ = Matrix::create(1, numClasses_);
   ones_->one();
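For reference, the constructor views the flat para buffer of size $(K+2)\,K$ (with $K$ = numClasses) as three stacked blocks, matching the [size + 2, size] parameter shape introduced later in this commit:

$$\theta = \begin{pmatrix} a \\ b \\ w \end{pmatrix} \in \mathbb{R}^{(K+2)\times K},\qquad a, b \in \mathbb{R}^{1\times K}\ \text{(start/end weights)},\quad w \in \mathbb{R}^{K\times K}\ \text{(transitions)}.$$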
@@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   return -ll;
 }

-void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
+void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
   MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  MatrixPtr matDX = Matrix::create(dx, length, numClasses_);
-  MatrixPtr matGrad = Matrix::create(length, numClasses_);
+  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
-  real* dw = dw_ ? dw_->getData() : nullptr;
+  if (needWGrad) {
+    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
+    matWGrad_->zeroMem();
+    da_ = matWGrad_->subRowMatrix(0, 1);
+    db_ = matWGrad_->subRowMatrix(1, 2);
+    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
+  }

   real* alpha = alpha_->getData();
   real* beta = beta_->getData();
   real* expW = expW_->getData();
   real* expX = expX_->getData();
-  real* grad = matGrad->getData();
+  real* grad = matGrad_->getData();

   for (int i = 0; i < numClasses_; ++i) {
     beta[(length - 1) * numClasses_ + i] = exp(b[i]);
@@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
     normalizeL1(beta + k * numClasses_, numClasses_);
   }

-  matGrad->dotMul(*alpha_, *beta_);
-  matGrad->rowNormalizeL1(*matGrad);
+  matGrad_->dotMul(*alpha_, *beta_);
+  matGrad_->rowNormalizeL1(*matGrad_);
   for (int k = 0; k < length; ++k) {
     grad[k * numClasses_ + s[k]] -= (real)1;
   }
-  matDX->add(*matGrad);
-  if (da_) {
-    da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-  }
-  if (db_) {
-    db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1));
-  }
-
-  beta_->dotMul(*beta_, *expX_);
-  beta_->rowNormalizeL1(*beta_);
-
-  for (int k = 1; dw && k < length; ++k) {
-    real sum = 0;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-               beta[k * numClasses_ + j];
+
+  if (needWGrad) {
+    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
+    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
+
+    beta_->dotMul(*beta_, *expX_);
+    beta_->rowNormalizeL1(*beta_);
+
+    real* dw = dw_->getData();
+    for (int k = 1; k < length; ++k) {
+      real sum = 0;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
+                 beta[k * numClasses_ + j];
+        }
       }
-    }
-    sum = 1 / sum;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                   alpha[(k - 1) * numClasses_ + i] *
-                                   beta[k * numClasses_ + j];
+      sum = 1 / sum;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
+                                     alpha[(k - 1) * numClasses_ + i] *
+                                     beta[k * numClasses_ + j];
+        }
       }
+      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
     }
-    dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
   }
 }
...
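The loops above are the usual forward-backward gradient computation. As a sketch in the code's $\alpha/\beta$ notation (this derivation is a gloss, not part of the patch), the node gradient stored in matGrad_ is

$$\frac{\partial\,\mathrm{NLL}}{\partial x_{k,j}} = P(y_k = j \mid x) - \mathbb{1}[s_k = j],\qquad P(y_k = j \mid x) = \frac{\alpha_{k,j}\,\beta_{k,j}}{\sum_{j'} \alpha_{k,j'}\,\beta_{k,j'}},$$

and each iteration of the transition loop adds the pairwise marginal $P(y_{k-1}=i,\,y_k=j \mid x) \propto \alpha_{k-1,i}\,e^{w_{ij}}\,\beta_{k,j}$ (normalized by the double sum it computes first) and subtracts 1 at the gold pair $(s_{k-1}, s_k)$.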
@@ -21,7 +21,7 @@ namespace paddle {
 class LinearChainCRF {
 public:
   /**
-   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
    * The next numClasses values of para are for ending weights (\f$b\f$),
    * The remaining values are for transition weights (\f$w\f$).
@@ -34,7 +34,7 @@ public:
    * all possible
    * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
    */
-  LinearChainCRF(int numClasses, real* para, real* grad);
+  LinearChainCRF(int numClasses, real* para);

   /**
    * Calculate the negative log likelihood of s given x.
@@ -45,29 +45,45 @@ public:
   /**
    * Calculate the gradient with respect to x, a, b, and w.
-   * The gradient of x will be stored in dx.
    * backward() can only be called after a corresponding call to forward() with
    * the same x, s and length.
-   * @note The gradient is added to dx and grad (provided at constructor).
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
    */
-  void backward(real* x, real* dx, int* s, int length);
+  void backward(real* x, int* s, int length, bool needWGrad);

   /**
    * Find the most probable sequence given x. The result will be stored in s.
    */
   void decode(real* x, int* s, int length);

+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called after
+   * a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
 protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
   MatrixPtr w_;
+  MatrixPtr matWGrad_;
   MatrixPtr da_;
   MatrixPtr db_;
   MatrixPtr dw_;
   MatrixPtr ones_;

   MatrixPtr expX_;
+  MatrixPtr matGrad_;
   MatrixPtr alpha_;
   MatrixPtr beta_;
   MatrixPtr maxX_;
...
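A minimal sketch of how a caller drives the revised interface, mirroring the per-sequence logic in CRFLayer::backward() above; the helper name and gradient buffers are illustrative, not part of the patch:

#include "paddle/gserver/layers/LinearChainCRF.h"

using namespace paddle;  // NOLINT

// Accumulate one sequence's weighted gradients into xGrad and wGrad.
// x points at length x numClasses input scores, s at the gold label ids;
// instWeight is the per-sequence weight already multiplied by coeff.
real addSequenceGrad(LinearChainCRF& crf, real* x, int* s, int length,
                     real instWeight, MatrixPtr xGrad, MatrixPtr wGrad) {
  real nll = crf.forward(x, s, length);  // must precede backward()
  crf.backward(x, s, length, /* needWGrad= */ wGrad != nullptr);
  xGrad->add(*crf.getXGrad(), real(1.0f), instWeight);
  if (wGrad) {
    wGrad->add(*crf.getWGrad(), real(1.0f), instWeight);
  }
  return nll;
}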
@@ -18,6 +18,14 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
          COMMAND test_LayerGrad)

+################ test_CRFLayerGrad ####################
+add_unittest_without_exec(test_CRFLayerGrad
+                          test_CRFLayerGrad.cpp
+                          LayerGradUtil.cpp)
+add_test(NAME test_CRFLayerGrad
+         COMMAND test_CRFLayerGrad)
+
 add_unittest_without_exec(test_ActivationGrad
                           test_ActivationGrad.cpp
                           LayerGradUtil.cpp)
...
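The new target builds the file shown in full below (test_CRFLayerGrad.cpp, per the CMake rule above) and registers it with CTest; under a standard CTest setup it can be run selectively with `ctest -R test_CRFLayerGrad`.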
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "ModelConfig.pb.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/gserver/layers/LinearChainCRF.h"
#include "paddle/trainer/Trainer.h"

#include "LayerGradUtil.h"
#include "paddle/testing/TestUtil.h"

using namespace paddle;  // NOLINT

DECLARE_int32(gpu_id);
DECLARE_bool(thread_local_rand_use_global_seed);

static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
  for (auto& v : seq) {
    if (++v < numClasses) {
      return true;
    }
    v = 0;
  }
  return false;
}

// log(exp(x) + exp(y))
static inline real logSum(real x, real y) {
  real maxValue = std::max(x, y);
  if (std::isinf(maxValue)) {
    return -std::numeric_limits<real>::infinity();
  } else {
    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
  }
}

static inline std::vector<int> genRandLabels(int numClasses, int length) {
  std::vector<int> labels(length);
  for (int i = 0; i < length; ++i) {
    labels[i] = rand() % numClasses;  // NOLINT
  }
  return labels;
}
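// The cost test below cross-checks forward() by brute force: it enumerates
// all numClasses^length label sequences with getNextSequence(), scores each
// one as
//   score(s) = a[s_0] + sum_k x[k][s_k] + sum_{k>0} w[s_{k-1}][s_k] + b[s_{L-1}],
// and expects cost == -(score(golden) - log sum_s exp(score(s))), with the
// log-sum accumulated stably by logSum() above.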
TEST(CRFLayer, cost) {
  const int numClasses = 4;
  CpuVector para(numClasses * (numClasses + 2));
  real* a = para.getData();
  real* b = para.getData() + numClasses;
  real* w = para.getData() + 2 * numClasses;
  LinearChainCRF crf(4, para.getData());
  for (int length : {1, 2, 3, 10}) {
    for (int tries = 0; tries < 10; ++tries) {
      CpuMatrix x(length, numClasses);
      x.randomizeUniform();
      para.randnorm(0, 2);

      std::vector<int> goldenLabels = genRandLabels(numClasses, length);

      real cost = crf.forward(x.getData(), goldenLabels.data(), length);

      real logZ = -std::numeric_limits<real>::infinity();
      real logNominator = -std::numeric_limits<real>::infinity();
      std::vector<int> testResult(length, 0);
      do {
        real score = a[testResult.front()];
        score += x.getElement(0, testResult.front());
        for (int k = 1; k < length; ++k) {
          score += x.getElement(k, testResult[k]) +
                   w[numClasses * testResult[k - 1] + testResult[k]];
        }
        score += b[testResult.back()];
        logZ = logSum(logZ, score);

        if (goldenLabels == testResult) {
          logNominator = score;
        }
      } while (getNextSequence(testResult, numClasses));

      real trueCost = -logNominator + logZ;

      real diff = fabs(trueCost - cost);
      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost
              << " diff=" << diff << std::endl;
      if (typeid(real) == typeid(double)) {  // NOLINT
        EXPECT_LE(diff, 1e-10);
      } else {
        EXPECT_LE(diff, 5e-3);
      }
    }
  }
}

inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }

TestConfig initTestConfig(size_t numClasses, bool withWeight) {
  TestConfig config;
  config.layerConfig.set_type("crf");
  config.layerConfig.set_size(numClasses);
  config.biasSize = 0;

  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
                              "layer_0",
                              numClasses,
                              numClasses * (numClasses + 2)});
  config.layerConfig.add_inputs();
  config.inputDefs.push_back(
      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
  config.layerConfig.add_inputs();

  if (withWeight) {
    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
    config.layerConfig.add_inputs();
  }

  return config;
}

TEST(Layer, CRFLayer) {
  size_t numClasses = 10;
  for (int tries = 0; tries < 5; ++tries) {
    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
    for (int length : {1, 3, 100}) {
      // GPU is not supported yet
      testLayerGrad(config,
                    "crf",
                    length,
                    /* trans= */ false,
                    /* useGpu= */ false,
                    /* useWeight= */ false,
                    epsilon());
    }
  }
}

TEST(Layer, CRFLayerUseWeight) {
  size_t numClasses = 10;
  for (int tries = 0; tries < 5; ++tries) {
    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
    for (int length : {1, 3, 100}) {
      // GPU is not supported yet
      testLayerGrad(config,
                    "crf",
                    length,
                    /* trans= */ false,
                    /* useGpu= */ false,
                    /* useWeight= */ false,
                    epsilon());
    }
  }
}

int main(int argc, char** argv) {
  initMain(argc, argv);
  hl_start();
  hl_init(FLAGS_gpu_id);
  FLAGS_thread_local_rand_use_global_seed = true;
  srand(1);
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
@@ -276,27 +276,6 @@ TEST(Layer, AddtoLayer) {
   }
 }

-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
 TEST(Layer, CTCLayer) {
   TestConfig config;
   config.layerConfig.set_type("ctc");
...
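The inline CRF gradient test removed here is superseded by the dedicated test_CRFLayerGrad added in this commit, which also exercises the three-input (weighted) configuration and replaces the fixed 0.03 epsilon with a type-dependent one.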
@@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) {
   real* a = para.getData();
   real* b = para.getData() + numClasses;
   real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData(), nullptr);
+  LinearChainCRF crf(4, para.getData());
   for (int length : {1, 2, 3, 10}) {
     for (int tries = 0; tries < 10; ++tries) {
       CpuMatrix x(length, numClasses);
...
@@ -2998,7 +2998,7 @@ class CRFLayer(LayerBase):
         super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
         config_assert(2 <= len(self.inputs) <= 3,
                       'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
         self.config.coeff = coeff
@@ -3020,7 +3020,7 @@ class CRFDecodingLayer(LayerBase):
         config_assert(
             len(self.inputs) <= 2,
             'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])

 @config_layer('ctc')
...
@@ -239,9 +239,9 @@ parameters {
   name: "___crf_layer_0__.w0"
   size: 24
   initial_mean: 0.0
-  initial_std: 0.5
-  dims: 4
+  initial_std: 0.408248290464
   dims: 6
+  dims: 4
   initial_strategy: 0
   initial_smart: true
 }
...
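With the parameter dims reordered from [4, 6] to [6, 4], the expected smart-initialization std changes from $0.5 = 1/\sqrt{4}$ to $0.408248290464 \approx 1/\sqrt{6}$, consistent with $\sigma = 1/\sqrt{\mathrm{dims}[0]}$; that formula is inferred from the two expectation values shown here, not verified against the initializer code.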