add crf_decoding layer (#6274)

* add crf_decoding layer * fix some typo * fix test_crf_decoding_op

add crf_decoding layer (#6274)
* add crf_decoding layer * fix some typo * fix test_crf_decoding_op
45c8a88a · Qiao Longfei · GitHub · e760641a · 45c8a88a · 45c8a88a
8 changed files
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -36,17 +36,18 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
        "w. See more details in comments of the linear_chain_crf operator.");
    AddInput(
        "Label",
-        "(LoDTensor,  LoDTensor<int>). The ground truth with shape "
+        "(LoDTensor,  LoDTensor<int64_t>). The ground truth with shape "
        "[N x 1]. This input is optional. See more details in the operator's "
        "comments.")
        .AsDispensable();
-    AddOutput("ViterbiPath",
+    AddOutput(
-              "(LoDTensor, LoDTensor<int>). The decoding results. What to "
+        "ViterbiPath",
-              "return changes depending on whether the Input(Label) (the groud "
+        "(LoDTensor, LoDTensor<int64_t>). The decoding results. What to "
+        "return changes depending on whether the Input(Label) (the ground "
        "truth) is given. See more details in the operator's comment.");
    AddComment(R"DOC(
 The crf_decoding operator reads the emission feature weights and the transition
-freature weights learned by the linear_chain_crf operator. It implements the
+feature weights learned by the linear_chain_crf operator. It implements the
 Viterbi algorithm which is a dynamic programming algorithm for finding the most
 likely sequence of hidden states, called the Viterbi path, that results in a
 sequence of observed tags.
@@ -60,14 +61,14 @@ operator.
 When Input(Label) is given, the crf_decoding operator returns a row vector
 with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an ouput is the
+prediction, or 1 indicating a tag is correctly predicted. Such an output is the
 input to chunk_eval operator.
 2. Input(Label) is not given:
 This is the standard decoding process.
-The crf_decoding operator returns a row vecotr with shape [N x 1] whose values
+The crf_decoding operator returns a row vector with shape [N x 1] whose values
 range from 0 to maximum tag number - 1. Each element indicates an index of a
 predicted tag.
 )DOC");

--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/operators/crf_decoding_op.h
@@ -43,8 +43,8 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    const size_t level = 0;
    const size_t seq_num = lod[level].size() - 1;
-    int* path = decoded_path->mutable_data<int>(platform::CPUPlace());
+    int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
-    math::SetConstant<platform::CPUPlace, int>()(ctx.device_context(),
+    math::SetConstant<platform::CPUPlace, int64_t>()(ctx.device_context(),
                                                     decoded_path, 0);
    for (size_t i = 0; i < seq_num; ++i) {
      int start_pos = static_cast<int>(lod[level][i]);
@@ -57,7 +57,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    if (label) {
      PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
                        "The Input(Label) should be a sequence.");
-      const int* label_value = label->data<int>();
+      const int64_t* label_value = label->data<int64_t>();
      size_t batch_size = emission_weights->dims()[0];
      for (size_t i = 0; i < batch_size; ++i) {
        path[i] = label_value[i] == path[i] ? 1 : 0;
@@ -76,7 +76,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    const T* x = emission_weights.data<T>();
    const T* w = transition_weights.data<T>();
-    int* path = decoded_path->data<int>();
+    int64_t* path = decoded_path->data<int64_t>();
    // alpha is a memo table. An element alpha(k, v) records the score of the
    // best sequence of tags from position 1 to position k with v being the end

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -237,7 +237,7 @@ class Operator(object):
        def find_name(var_list, name):
            for var_name in var_list:
-                if var_name == name:
+                if var_list[var_name] is not None and var_name == name:
                    return True
            return False

--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
 import copy
 import itertools
-from framework import Variable, default_main_program, default_startup_program, \
+from framework import Variable, Parameter, default_main_program, default_startup_program, \
    unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
 from param_attr import ParamAttr
@@ -122,6 +122,12 @@ class LayerHelper(object):
        return self.main_program.global_block().create_parameter(
            dtype=dtype, shape=shape, **attr.to_kwargs())
+    def get_parameter(self, name):
+        param = self.main_program.global_block().var(name)
+        if not isinstance(param, Parameter):
+            raise ValueError("no Parameter name %s found" % name)
+        return param
    def create_tmp_variable(self, dtype):
        return self.main_program.current_block().create_var(
            name=unique_name(".".join([self.name, 'tmp'])),

--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -477,6 +477,24 @@ def linear_chain_crf(input,
    return log_likelihood
+def crf_decoding(input,
+                 param_attr,
+                 label=None,
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
+    return viterbi_path
 def assign(input, output, main_program=None, startup_program=None):
    helper = LayerHelper('assign', **locals())
    helper.append_op(

--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -137,12 +137,19 @@ def main():
        param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(x=crf_cost)
    # TODO(qiao)
-    #   1. add crf_decode_layer and evaluator
+    # check other optimizers and check why out will be NAN
-    #   2. use other optimizer and check why out will be NAN
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
    sgd_optimizer.minimize(avg_cost)
+    # TODO(qiao)
+    # add dependency track and move this config before optimizer
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(name='crfw'))
    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
@@ -168,7 +175,6 @@ def main():
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            if batch_id % 10 == 0:
                print("avg_cost=" + str(avg_cost_val))

--- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
@@ -20,14 +20,14 @@ class CRFDecoding(object):
        self.w = transition_weights[2:, :]
        self.track = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="int32")
+            (seq_start_positions[-1], self.tag_num), dtype="int64")
        self.decoded_path = np.zeros(
-            (seq_start_positions[-1], 1), dtype="int32")
+            (seq_start_positions[-1], 1), dtype="int64")
    def _decode_one_sequence(self, decoded_path, x):
        seq_len, tag_num = x.shape
        alpha = np.zeros((seq_len, tag_num), dtype="float64")
-        track = np.zeros((seq_len, tag_num), dtype="int32")
+        track = np.zeros((seq_len, tag_num), dtype="int64")
        for i in range(tag_num):
            alpha[0, i] = self.a[i] + x[0, i]
@@ -125,10 +125,10 @@ class TestCRFDecodingOp2(OpTest):
            axis=0)
        labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
        predicted_labels = np.ones(
-            (lod[-1][-1], 1), dtype="int32") * (TAG_NUM - 1)
+            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
-        expected_output = (labels == predicted_labels).astype("int32")
+        expected_output = (labels == predicted_labels).astype("int64")
        self.inputs = {
            "Emission": (emission, lod),

--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -4,6 +4,7 @@ import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.param_attr import ParamAttr
 class TestBook(unittest.TestCase):
@@ -132,8 +133,12 @@ class TestBook(unittest.TestCase):
            images = layers.data(name='pixel', shape=[784], dtype='float32')
            label = layers.data(name='label', shape=[1], dtype='int32')
            hidden = layers.fc(input=images, size=128)
-            crf = layers.linear_chain_crf(input=hidden, label=label)
+            crf = layers.linear_chain_crf(
+                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
+            crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
            self.assertNotEqual(crf, None)
+            self.assertNotEqual(crf_decode, None)
        print(str(program))