From 4eb484579c95a37a247e277f72af20c09d6176b8 Mon Sep 17 00:00:00 2001
From: kinghuin
Date: Thu, 28 Jan 2021 14:40:49 +0800
Subject: [PATCH] [Cherry-pick] fix crf bug in paddlenlp (#5242)

[Cherry-pick] fix crf bug in paddlenlp
---
 PaddleNLP/examples/lexical_analysis/data.py  |   4 +-
 PaddleNLP/examples/lexical_analysis/eval.py  |   2 +-
 PaddleNLP/examples/lexical_analysis/model.py |  12 +-
 .../examples/lexical_analysis/predict.py     |   4 +-
 PaddleNLP/examples/lexical_analysis/train.py |   9 +-
 .../express_ner/run_bigru_crf.py             |   2 +-
 PaddleNLP/paddlenlp/layers/crf.py            | 127 +++++++++---------
 PaddleNLP/paddlenlp/metrics/chunk.py         |  12 +-
 8 files changed, 90 insertions(+), 82 deletions(-)

diff --git a/PaddleNLP/examples/lexical_analysis/data.py b/PaddleNLP/examples/lexical_analysis/data.py
index 74523d43..3342b7c8 100644
--- a/PaddleNLP/examples/lexical_analysis/data.py
+++ b/PaddleNLP/examples/lexical_analysis/data.py
@@ -161,11 +161,11 @@ def parse_lac_result(words, preds, lengths, word_vocab, label_vocab):
     for sent_index in range(len(lengths)):
         sent = [
             id2word_dict[index]
-            for index in words[sent_index][:lengths[sent_index] - 1]
+            for index in words[sent_index][:lengths[sent_index]]
         ]
         tags = [
             id2label_dict[index]
-            for index in preds[sent_index][:lengths[sent_index] - 1]
+            for index in preds[sent_index][:lengths[sent_index]]
         ]

         sent_out = []
diff --git a/PaddleNLP/examples/lexical_analysis/eval.py b/PaddleNLP/examples/lexical_analysis/eval.py
index d8b78eda..0a2d2e7e 100644
--- a/PaddleNLP/examples/lexical_analysis/eval.py
+++ b/PaddleNLP/examples/lexical_analysis/eval.py
@@ -56,7 +56,7 @@ def evaluate(args):
         dataset=test_dataset,
         batch_size=args.batch_size,
         shuffle=False,
-        drop_last=True)
+        drop_last=False)
     test_loader = paddle.io.DataLoader(
         dataset=test_dataset,
         batch_sampler=test_sampler,
diff --git a/PaddleNLP/examples/lexical_analysis/model.py b/PaddleNLP/examples/lexical_analysis/model.py
index 6437cab0..26c62e15 100644
--- a/PaddleNLP/examples/lexical_analysis/model.py
+++ b/PaddleNLP/examples/lexical_analysis/model.py
@@ -39,7 +39,8 @@ class BiGruCrf(nn.Layer):
                  vocab_size,
                  num_labels,
                  emb_lr=2.0,
-                 crf_lr=0.2):
+                 crf_lr=0.2,
+                 with_start_stop_tag=True):
         super(BiGruCrf, self).__init__()
         self.word_emb_dim = word_emb_dim
         self.vocab_size = vocab_size
@@ -73,14 +74,17 @@ class BiGruCrf(nn.Layer):

         self.fc = nn.Linear(
             in_features=self.hidden_size * 2,
-            out_features=self.num_labels + 2,
+            out_features=self.num_labels + 2 \
+                if with_start_stop_tag else self.num_labels,
             weight_attr=paddle.ParamAttr(
                 initializer=nn.initializer.Uniform(
                     low=-self.init_bound, high=self.init_bound),
                 regularizer=paddle.regularizer.L2Decay(coeff=1e-4)))

-        self.crf = LinearChainCrf(self.num_labels, self.crf_lr)
-        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)
+        self.crf = LinearChainCrf(self.num_labels, self.crf_lr,
+                                  with_start_stop_tag)
+        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions,
+                                              with_start_stop_tag)

     def forward(self, inputs, lengths):
         word_embed = self.word_embedding(inputs)
diff --git a/PaddleNLP/examples/lexical_analysis/predict.py b/PaddleNLP/examples/lexical_analysis/predict.py
index accdd78b..4b93a42c 100644
--- a/PaddleNLP/examples/lexical_analysis/predict.py
+++ b/PaddleNLP/examples/lexical_analysis/predict.py
@@ -55,7 +55,7 @@ def infer(args):
         dataset=infer_dataset,
         batch_size=args.batch_size,
         shuffle=False,
-        drop_last=True)
+        drop_last=False)
     infer_loader = paddle.io.DataLoader(
         dataset=infer_dataset,
         batch_sampler=infer_sampler,
@@ -75,7 +75,7 @@ def infer(args):
         test_data=infer_loader, batch_size=args.batch_size)

     # Post-processing the lexical analysis results
-    lengths = np.array(lengths).reshape([-1])
+    lengths = np.array([l for lens in lengths for l in lens]).reshape([-1])
     preds = np.array(
         [pred for batch_pred in crf_decodes for pred in batch_pred])

diff --git a/PaddleNLP/examples/lexical_analysis/train.py b/PaddleNLP/examples/lexical_analysis/train.py
index ffda35b5..5ad9dc4b 100644
--- a/PaddleNLP/examples/lexical_analysis/train.py
+++ b/PaddleNLP/examples/lexical_analysis/train.py
@@ -77,7 +77,7 @@ def train(args):
         dataset=test_dataset,
         batch_size=args.batch_size,
         shuffle=False,
-        drop_last=True)
+        drop_last=False)
     test_loader = paddle.io.DataLoader(
         dataset=test_dataset,
         batch_sampler=test_sampler,
@@ -93,7 +93,7 @@ def train(args):
     # Prepare optimizer, loss and metric evaluator
     optimizer = paddle.optimizer.Adam(
         learning_rate=args.base_lr, parameters=model.parameters())
-    crf_loss = LinearChainCrfLoss(network.crf.transitions)
+    crf_loss = LinearChainCrfLoss(network.crf)
     chunk_evaluator = ChunkEvaluator(
         label_list=train_dataset.label_vocab.keys(), suffix=True)
     model.prepare(optimizer, crf_loss, chunk_evaluator)
@@ -101,7 +101,6 @@ def train(args):
         model.load(args.init_checkpoint)

     # Start training
-    callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
     model.fit(train_data=train_loader,
               eval_data=test_loader,
               batch_size=args.batch_size,
@@ -110,9 +109,7 @@ def train(args):
               log_freq=10,
               save_dir=args.model_save_dir,
               save_freq=1,
-              drop_last=True,
-              shuffle=True,
-              callbacks=callback)
+              shuffle=True)


 if __name__ == "__main__":
diff --git a/PaddleNLP/examples/named_entity_recognition/express_ner/run_bigru_crf.py b/PaddleNLP/examples/named_entity_recognition/express_ner/run_bigru_crf.py
index f8969230..7f7eb6e3 100644
--- a/PaddleNLP/examples/named_entity_recognition/express_ner/run_bigru_crf.py
+++ b/PaddleNLP/examples/named_entity_recognition/express_ner/run_bigru_crf.py
@@ -164,7 +164,7 @@ if __name__ == '__main__':
     optimizer = paddle.optimizer.Adam(
         learning_rate=0.001, parameters=model.parameters())
-    crf_loss = LinearChainCrfLoss(network.crf.transitions)
+    crf_loss = LinearChainCrfLoss(network.crf)
     chunk_evaluator = ChunkEvaluator(
         label_list=train_ds.label_vocab.keys(), suffix=True)
     model.prepare(optimizer, crf_loss, chunk_evaluator)

diff --git a/PaddleNLP/paddlenlp/layers/crf.py b/PaddleNLP/paddlenlp/layers/crf.py
index 82a1801c..e116978f 100644
--- a/PaddleNLP/paddlenlp/layers/crf.py
+++ b/PaddleNLP/paddlenlp/layers/crf.py
@@ -58,7 +58,8 @@ class LinearChainCrf(nn.Layer):

     def _initialize_alpha(self, batch_size):
         # alpha accumulate the path value to get the different next tag
-        if self._initial_alpha is None:
+        if self._initial_alpha is None or batch_size > self._initial_alpha.shape[
+                0]:
             # Initialized by a small value.
             initial_alpha = paddle.full(
                 (batch_size, self.num_tags - 1), dtype='float32', fill_value=-10000.)
@@ -69,7 +70,7 @@ class LinearChainCrf(nn.Layer):
                 (batch_size, 1), dtype='float32', fill_value=0.)
             self._initial_alpha = paddle.concat(
                 [initial_alpha, alpha_start], axis=1)
-        return self._initial_alpha
+        return self._initial_alpha[:batch_size, :]

     def forward(self, inputs, lengths):
         """
@@ -99,27 +100,20 @@ class LinearChainCrf(nn.Layer):

         all_alpha = []
         if self.with_start_stop_tag:
-            alpha = self._initialize_alpha(batch_size).detach()
-            for i, input_exp in enumerate(inputs_t_exp):
-                # input_exp: batch_size, num_tags, num_tags
-                # alpha_exp: batch_size, num_tags, num_tags
+            alpha = self._initialize_alpha(batch_size)
+
+        for i, input_exp in enumerate(inputs_t_exp):
+            # input_exp: batch_size, num_tags, num_tags
+            # alpha_exp: batch_size, num_tags, num_tags
+            if i == 0 and not self.with_start_stop_tag:
+                mat = input_exp
+            else:
                 alpha_exp = alpha.unsqueeze(1).expand(
                     [batch_size, n_labels, n_labels])
                 # F(n) = logsumexp(F(n-1) + p(y_n) + T(y_{n-1}, y_n))
                 mat = input_exp + trans_exp + alpha_exp
-                alpha = paddle.logsumexp(mat, 2)
-                all_alpha.append(alpha)
-        else:
-            for i, input_exp in enumerate(inputs_t_exp):
-                if i == 0:
-                    alpha = inputs.transpose([1, 0, 2])[0]
-                else:
-                    alpha_exp = alpha.unsqueeze(1).expand(
-                        [batch_size, n_labels, n_labels])
-                    # F(n) = logsumexp(F(n-1) + p(y_n) + T(y_{n-1}, y_n))
-                    mat = input_exp + trans_exp + alpha_exp
-                    alpha = paddle.logsumexp(mat, 2)
-                    all_alpha.append(alpha)
+            alpha = paddle.logsumexp(mat, 2)
+            all_alpha.append(alpha)

         # Get the valid alpha
         all_alpha = paddle.stack(all_alpha).transpose([1, 0, 2])
@@ -166,8 +160,7 @@ class LinearChainCrf(nn.Layer):
             sequence_mask(
                 self._get_batch_seq_index(batch_size, seq_len), lengths),
             'float32')
-        if self.with_start_stop_tag:
-            mask = mask[:, :seq_len]
+        mask = mask[:, :seq_len]
         mask_scores = scores * mask
         score = paddle.sum(mask_scores, 1)

@@ -191,6 +184,10 @@ class LinearChainCrf(nn.Layer):
                 fill_value=self.stop_idx)
             labels_ext = (1 - mask) * pad_stop + mask * labels_ext
         else:
+            mask = paddle.cast(
+                sequence_mask(
+                    self._get_batch_seq_index(batch_size, seq_len), lengths),
+                'int32')
             labels_ext = labels

         start_tag_indices = labels_ext[:, :-1]
@@ -212,7 +209,8 @@ class LinearChainCrf(nn.Layer):
         return score

     def _get_start_stop_tensor(self, batch_size):
-        if self._start_tensor is None or self._stop_tensor is None:
+        if self._start_tensor is None or self._stop_tensor is None or batch_size != self._start_tensor.shape[
+                0]:
             self._start_tensor = paddle.full(
                 (batch_size, 1), dtype='int64', fill_value=self.start_idx)
             self._stop_tensor = paddle.full(
@@ -220,7 +218,8 @@ class LinearChainCrf(nn.Layer):
                 (batch_size, 1), dtype='int64', fill_value=self.stop_idx)
         return self._start_tensor, self._stop_tensor

     def _get_batch_index(self, batch_size):
-        if self._batch_index is None:
+        if self._batch_index is None or batch_size != self._batch_index.shape[
+                0]:
             self._batch_index = paddle.arange(end=batch_size, dtype="int64")
         return self._batch_index

@@ -231,36 +230,39 @@ class LinearChainCrf(nn.Layer):

     def _get_batch_seq_index(self, batch_size, length):
         if self._batch_seq_index is None or length + 2 > self._batch_seq_index.shape[
-                1]:
+                1] or batch_size > self._batch_seq_index.shape[0]:
             self._batch_seq_index = paddle.cumsum(
                 paddle.ones([batch_size, length + 2], "int64"), axis=1) - 1
         if self.with_start_stop_tag:
-            return self._batch_seq_index[:, :length + 2]
+            return self._batch_seq_index[:batch_size, :length + 2]
         else:
-            return self._batch_seq_index[:, :length]
+            return self._batch_seq_index[:batch_size, :length]


-class LinearChainCrfLoss(LinearChainCrf):
+class LinearChainCrfLoss(nn.Layer):
     """The negative log-likelihood for linear chain Conditional Random Field (CRF).
     let $$ Z(x) = \\sum_{y'}exp(score(x,y')) $$, means the sum of all path scores, then we have $$ loss = -logp(y|x) = -log(exp(score(x,y))/Z(x)) = -score(x,y) + logZ(x) $$

     Args:
-        transitions (Tensor): The transition matrix.
+        crf (LinearChainCrf): The LinearChainCrf network.
     """

-    def __init__(self, transitions):
-        num_labels = transitions.shape[0] - 2
-        super(LinearChainCrfLoss, self).__init__(num_labels)
-        self.transitions.set_value(transitions)
+    def __init__(self, crf):
+        super(LinearChainCrfLoss, self).__init__()
+        self.crf = crf
+        if isinstance(crf, paddle.fluid.framework.ParamBase):
+            raise ValueError(
+                "From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss should be a LinearChainCrf object. If you were passing in 'crf.transitions', pass the 'crf' object itself instead."
+            )

     def forward(self, inputs, lengths, predictions, labels):
         # Note: When closing to convergence, the loss could be a small negative number. This may caused by underflow when calculating exp in logsumexp.
         # We add relu here to avoid negative loss. In theory, the crf loss must be greater than or equal to 0, relu will not impact on it.
         return nn.functional.relu(
-            super(LinearChainCrfLoss, self).forward(inputs, lengths) -
-            self.gold_score(inputs, labels, lengths))
+            self.crf.forward(inputs, lengths) - self.crf.gold_score(
+                inputs, labels, lengths))


 class ViterbiDecoder(nn.Layer):
     """
@@ -278,7 +280,9 @@ class ViterbiDecoder(nn.Layer):
         self.transitions = transitions
         self.with_start_stop_tag = with_start_stop_tag
         # If consider start and stop, -1 should be START and -2 should be STOP.
-        self.stop_idx = -2
+        if with_start_stop_tag:
+            self.start_idx = -1
+            self.stop_idx = -2
         self.num_tags = transitions.shape[0]

         self._initial_alpha = None
@@ -287,7 +291,8 @@ class ViterbiDecoder(nn.Layer):

     def _initialize_alpha(self, batch_size):
         # alpha accumulate the path value to get the different next tag
-        if self._initial_alpha is None:
+        if self._initial_alpha is None or batch_size > self._initial_alpha.shape[
+                0]:
             # Initialized by a small value.
             initial_alpha = paddle.full(
                 (batch_size, self.num_tags - 1), dtype='float32', fill_value=-10000.)
@@ -298,7 +303,7 @@ class ViterbiDecoder(nn.Layer):
                 (batch_size, 1), dtype='float32', fill_value=0.)
             self._initial_alpha = paddle.concat(
                 [initial_alpha, alpha_start], axis=1)
-        return self._initial_alpha
+        return self._initial_alpha[:batch_size, :]

     def forward(self, inputs, lengths):
         """
@@ -313,32 +318,34 @@ class ViterbiDecoder(nn.Layer):
         """
         batch_size, seq_len, n_labels = inputs.shape
         inputs_t = inputs.transpose([1, 0, 2])
-        trn_exp = self.transitions.unsqueeze(0).expand(
+        trans_exp = self.transitions.unsqueeze(0).expand(
             [batch_size, n_labels, n_labels])

         all_alpha = []
         historys = []
-        alpha = self._initialize_alpha(batch_size).detach(
-        ) if self.with_start_stop_tag else None
-        # inputs_t: seq_len, batch_size, n_labels
-        # logit: batch_size, n_labels
+        if self.with_start_stop_tag:
+            alpha = self._initialize_alpha(batch_size)
+        else:
+            alpha = paddle.zeros((batch_size, self.num_tags), dtype='float32')
+
         for i, logit in enumerate(inputs_t):
-            if alpha is not None:
-                alpha_exp = alpha.unsqueeze(1).expand(
-                    [batch_size, n_labels, n_labels])
-                # alpha_trn_sum: batch_size, n_labels, n_labels
-                alpha_trn_sum = alpha_exp + trn_exp
-                # alpha_max: batch_size, n_labels
-                # We don't include the emission scores here because the max does not depend on them (we add them in below)
-                alpha_max = alpha_trn_sum.max(2)
+            alpha_exp = alpha.unsqueeze(1).expand(
+                [batch_size, n_labels, n_labels])
+            # alpha_trn_sum: batch_size, n_labels, n_labels
+            alpha_trn_sum = alpha_exp + trans_exp
+            # alpha_max: batch_size, n_labels
+            # We don't include the emission scores here because the max does not depend on them (we add them in below)
+            alpha_max = alpha_trn_sum.max(2)
+            if i == 0:
+                # If self.with_start_stop_tag, the first antecedent tag must be START, so drop it.
+                # Otherwise, the first label has no antecedent tag, so skip it.
+                pass
+            else:
                 alpha_argmax = alpha_trn_sum.argmax(2)
                 historys.append(alpha_argmax)
-                # Now add in the emission scores
-                alpha = alpha_max + logit
-            else:
-                alpha = logit
-
+            # Now add the emission scores
+            alpha = alpha_max + logit
             all_alpha.append(alpha)

         # Get the valid alpha
         all_alpha = paddle.stack(all_alpha).transpose([1, 0, 2])
@@ -358,6 +365,7 @@ class ViterbiDecoder(nn.Layer):
         historys = paddle.stack(historys).numpy()
         lengths_np = lengths.numpy()
         batch_path = []
+        max_len = 0
         for batch_id in range(batch_size):
             best_last_tag = last_ids[batch_id]
             path = [best_last_tag]
@@ -365,17 +373,16 @@ class ViterbiDecoder(nn.Layer):
                 # hist: batch_size, n_labels
                 best_last_tag = hist[batch_id][best_last_tag]
                 path.append(best_last_tag)
-            if self.with_start_stop_tag:
-                # the first one is start
-                start = path.pop()
             path.reverse()
+            max_len = max(max_len, len(path))
             # Pad to the max sequence length, so that the ChunkEvaluator can compute it
-            path += [0] * (seq_len - len(path))
             batch_path.append(path)
+        batch_path = [path + [0] * (max_len - len(path)) for path in batch_path]
         batch_path = paddle.to_tensor(batch_path)
         return scores, batch_path

     def _get_batch_index(self, batch_size):
-        if self._batch_index is None:
+        if self._batch_index is None or batch_size != self._batch_index.shape[
+                0]:
             self._batch_index = paddle.arange(end=batch_size, dtype="int64")
         return self._batch_index
diff --git a/PaddleNLP/paddlenlp/metrics/chunk.py b/PaddleNLP/paddlenlp/metrics/chunk.py
index a3ae5dbe..bd30f2fb 100644
--- a/PaddleNLP/paddlenlp/metrics/chunk.py
+++ b/PaddleNLP/paddlenlp/metrics/chunk.py
@@ -112,12 +112,12 @@ class ChunkEvaluator(paddle.metric.Metric):
             float: mean precision, recall and f1 score.
""" precision = float( - self.num_correct_chunks - ) / self.num_infer_chunks if self.num_infer_chunks else 0 - recall = float(self.num_correct_chunks - ) / self.num_label_chunks if self.num_label_chunks else 0 - f1_score = float(2 * precision * recall) / ( - precision + recall) if self.num_correct_chunks else 0 + self.num_correct_chunks / + self.num_infer_chunks) if self.num_infer_chunks else 0. + recall = float(self.num_correct_chunks / + self.num_label_chunks) if self.num_label_chunks else 0. + f1_score = float(2 * precision * recall / ( + precision + recall)) if self.num_correct_chunks else 0. return precision, recall, f1_score def reset(self): -- GitLab