提交 2ac9a3d8 编写于 作者: C caoying03

follow comments.

上级 dd2be3da
......@@ -235,7 +235,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
PADDLE_ENFORCE_LT(
begin_idx, end_idx,
"The start row index must be smaller than the end row index.");
"The start row index must be lesser than the end row index.");
if (dims_[0] == 1) {
return *this;
......
......@@ -26,9 +26,8 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
"Emission",
"(LoDTensor, default: LoDTensor<float>). "
"The unscaled emission weight matrix for the linear chain CRF. "
"This input is a LoDTensor with shape [N x D] where N is the total "
"element number of all input squences in a mini-batch, "
"and D is the total tag number.");
"This input is a LoDTensor with shape [N x D] where N is the size of "
"the mini-batch and D is the total tag number.");
AddInput(
"Transition",
"(Tensor, default: Tensor<float>). A Tensor with shape [(D + 2) x D]. "
......@@ -36,7 +35,7 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
"See more details in the operator's comments.");
AddInput(
"Label",
"(LoDTensor, default: LoDTensor<int>). The groundtruth which is a 2-D "
"(LoDTensor, default: LoDTensor<int>). The ground truth which is a 2-D "
"LoDTensor with shape [N x 1], where N is the total element number in "
"a mini-batch.");
AddOutput(
......@@ -77,12 +76,13 @@ variables. CRF learns the conditional probability \f$P(Y|X)\f$, where
Linear chain CRF is a special case of CRF that is useful for sequence labeling
task. Sequence labeling tasks do not assume a lot of conditional
independences among inputs. They only concern about the input and the output
being linear sequences. Thus, the graph model of such a CRF is a simple chain
or a line, which results in the linear chain CRF.
independences among inputs. The only constraint they impose is that the input
and output must be linear sequences. Thus, the graph of such a CRF is a simple
chain or a line, which results in the linear chain CRF.
This operator implements the Forward-Backward algorithm for the linear chain
CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf for reference.
CRF. Please see http://www.cs.columbia.edu/~mcollins/fb.pdf and
http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for reference.
Equation:
......@@ -111,7 +111,7 @@ NOTE:
transition features. The emission feature weights are NOT computed in
this operator. They MUST be computed first before this operator is called.
2. Because this operator performs globally normaliztion over all possible
2. Because this operator performs global normalization over all possible
sequences internally, it expects UNSCALED emission feature weights.
Please do not call this op with the emission feature being output of any
nonlinear activation.
......@@ -171,9 +171,10 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Alpha", emission_dims);
ctx->SetOutputDim("EmissionExps", emission_dims);
ctx->SetOutputDim("TransitionExps", transition_dims);
// (TODO caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
// TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
// is the sequence number in a mini-batch. The dimension set here should be
// resized to its correct size in the function Compute.
// resized to its correct size in the function Compute. Fix this once we can
// get LoD information in the InferShape interface.
ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1});
}
......@@ -236,7 +237,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
protected:
// Explicitly set that the data type of output of the linear_chain_crf_grad
// operator is determined by its input: graidents of LogLikelihood.
// operator is determined by its input: gradients of LogLikelihood.
framework::DataType IndicateDataType(
const framework::ExecutionContext& ctx) const override {
return framework::ToDataType(
......
......@@ -188,7 +188,6 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
const LoDTensor& src, LoDTensor* dst) {
dst->mutable_data<T>(src.dims(), platform::CPUPlace());
dst->CopyFrom(src, platform::CPUPlace(), ctx);
};
copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
......@@ -248,7 +247,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < tag_num; ++i) {
T sum = 0.;
for (size_t j = 0; j < tag_num; ++j) {
sum += alpha_value[(k - 1) * tag_num + j] *
sum += alpha_value[(k - 1) * tag_num + j] * // (*)
w_exps[(j + state_trans_base_idx) * tag_num + i];
}
alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum;
......@@ -291,7 +290,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
// These local variables hold the inputs and outputs, guaranteeing them on
// CPU memory, to provide a consistent reference.
// TODO(caoying) Fix this by moving all these local variables into the
// class's data members once we can profile the training process.
// class's data members once we can profile the training process, or
// implementing a real GPU kernel for CRF.
Tensor* label = nullptr;
Tensor label_tensor;
Tensor* emission_exps = nullptr;
......@@ -344,6 +344,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
transition_grad =
ctx.Output<Tensor>(framework::GradVarName("Transition"));
}
// TODO(caoying) Fix this constraint. When the Input(Emission) is from the
// data reader operator, it can have no gradients.
PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null.");
emission_grad->mutable_data<T>(platform::CPUPlace());
math::SetConstant<platform::CPUPlace, T>()(ctx.device_context(),
......@@ -458,7 +461,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < tag_num; ++i) {
T sum = 0.;
for (size_t j = 0; j < tag_num; ++j) {
sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *
sum += w_exps[(i + state_trans_base_idx) * tag_num + j] * // (**)
x_exps[(k + 1) * tag_num + j] *
beta_value[(k + 1) * tag_num + j];
}
......@@ -493,7 +496,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
// TODO(caoying): Fix this to avoid using this local variable.
// TODO(caoying): Fix this to avoid using this local variable once we can
// profile the training process.
Tensor tmp;
tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
auto tmp_mat = EigenMatrix<T>::From(tmp);
......
......@@ -83,6 +83,9 @@ class LinearChainCrfForward(object):
class TestLinearChainCrfOp(OpTest):
def set_test_data(self):
# TODO(caoying) Fix the unittest by: add the boundary cases when
# sequence lengths are 1, 2, and 3.
SEQ_NUM = 3
TAG_NUM = 17
MAX_SEQ_LEN = 5
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册