Commit 12fb5614
Authored Apr 21, 2020 by guosheng

Refine PPL and reader for seq2seq.

Parent: 833a0157

Showing 7 changed files with 138 additions and 149 deletions (+138 −149)
seq2seq/README.md        +1   -1
seq2seq/args.py          +6   -0
seq2seq/reader.py        +2   -2
seq2seq/seq2seq.yaml     +0   -83
seq2seq/seq2seq_base.py  +41  -0
seq2seq/train.py         +8   -63
seq2seq/utility.py       +80  -0
seq2seq/README.md
@@ -151,7 +151,7 @@ python infer.py \
     --reload_model attention_models/epoch_10 \
     --infer_output_file attention_infer_output/infer_output.txt \
     --beam_size 10 \
-    --use_gpu True
+    --use_gpu True \
     --eager_run False
 ```
seq2seq/args.py
@@ -88,6 +88,12 @@ def parse_args():
         default=5.0,
         help="max grad norm for global norm clip")
+    parser.add_argument(
+        "--log_freq",
+        type=int,
+        default=100,
+        help="The frequency to print training logs")
     parser.add_argument(
         "--model_path",
         type=str,
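The new option follows the same argparse pattern as the surrounding flags; a self-contained sketch of just that argument (the bare parser below is a stand-in for illustration, not the module's own parse_args):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--log_freq",
    type=int,
    default=100,
    help="The frequency to print training logs")
print(parser.parse_args([]).log_freq)                     # 100 (default)
print(parser.parse_args(["--log_freq", "50"]).log_freq)   # 50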
seq2seq/reader.py
@@ -168,7 +168,7 @@ class SampleInfo(object):
     def __init__(self, i, lens):
         self.i = i
         self.lens = lens
-        self.max_len = lens[0]
+        self.max_len = lens[0]  # to be consitent with the original reader

     def get_ranges(self, min_length=None, max_length=None, truncate=False):
         ranges = []
@@ -379,7 +379,7 @@ class Seq2SeqBatchSampler(BatchSampler):
         reverse = True
         for i in range(0, len(infos), self._pool_size):
             # to avoid placing short next to long sentences
-            reverse = not reverse
+            reverse = False  # not reverse
             infos[i:i + self._pool_size] = sorted(
                 infos[i:i + self._pool_size],
                 key=lambda x: x.max_len,
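The second hunk pins reverse to False so every pool is sorted by length in a consistent direction instead of alternating. A minimal, self-contained sketch of pool-wise sorting by length (function and variable names here are illustrative, not the sampler's API):

def sort_in_pools(lengths, pool_size):
    # Sort each consecutive pool of sample indices by sequence length so that
    # similarly sized samples end up in the same batch.
    order = list(range(len(lengths)))
    for i in range(0, len(order), pool_size):
        order[i:i + pool_size] = sorted(order[i:i + pool_size],
                                        key=lambda idx: lengths[idx])
    return order

print(sort_in_pools([7, 3, 9, 2, 5, 8], pool_size=3))  # [1, 0, 2, 3, 4, 5]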
seq2seq/seq2seq.yaml (deleted, file mode 100644 → 0; the entire file below was removed)

# used for continuous evaluation
enable_ce: False

eager_run: False

# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# path of trained parameter, to make prediction
init_from_params: "trained_params/step_100000/"
# the directory for saving model
save_model: "trained_models"
# the directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None

# The pattern to match training data files.
training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de"
# The pattern to match validation data files.
validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de"
# The pattern to match test data files.
predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
# The file to output the translation results of predict_file to.
output_file: "predict.txt"
# The path of vocabulary file of source language.
src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The path of vocabulary file of target language.
trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The <bos>, <eos> and <unk> tokens in the dictionary.
special_token: ["<s>", "<e>", "<unk>"]

# max length of sequences
max_length: 256
# whether to use cuda
use_cuda: True

# args for reader, see reader.py for details
token_delimiter: " "
use_token_batch: True
pool_size: 200000
sort_type: "pool"
shuffle: True
shuffle_batch: True
batch_size: 4096

# Hyparams for training:
# the number of epoches for training
epoch: 30

# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied to the LearningRateScheduler
# derived learning rate the to get the final learning rate.
learning_rate: 0.001

# Hyparams for generation:
# the parameters for beam search.
beam_size: 5
max_out_len: 256
# the number of decoded sentences to output.
n_best: 1

# Hyparams for model:
# These following five vocabularies related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size: 10000
# size of target word dictionay
trg_vocab_size: 10000
# index for <bos> token
bos_idx: 0
# index for <eos> token
eos_idx: 1
# index for <unk> token
unk_idx: 2

embed_dim: 512
hidden_size: 512
num_layers: 2
dropout: 0.1
seq2seq/seq2seq_base.py
@@ -200,3 +200,44 @@ class BaseInferModel(BaseModel):
         # dynamic decoding with beam search
         rs, _ = self.beam_search_decoder(inits=encoder_final_states)
         return rs
+
+
+class BaseGreedyInferModel(BaseModel):
+    def __init__(self,
+                 src_vocab_size,
+                 trg_vocab_size,
+                 embed_dim,
+                 hidden_size,
+                 num_layers,
+                 dropout_prob=0.,
+                 bos_id=0,
+                 eos_id=1,
+                 beam_size=1,
+                 max_out_len=256):
+        args = dict(locals())
+        args.pop("self")
+        args.pop("__class__", None)  # py3
+        args.pop("beam_size", None)
+        self.bos_id = args.pop("bos_id")
+        self.eos_id = args.pop("eos_id")
+        self.max_out_len = args.pop("max_out_len")
+        super(BaseGreedyInferModel, self).__init__(**args)
+        # dynamic decoder for inference
+        decoder_helper = GreedyEmbeddingHelper(
+            start_tokens=bos_id,
+            end_token=eos_id,
+            embedding_fn=self.decoder.embedder)
+        decoder = BasicDecoder(
+            cell=self.decoder.stack_lstm.cell,
+            helper=decoder_helper,
+            output_fn=self.decoder.output_layer)
+        self.greedy_search_decoder = DynamicDecode(
+            decoder, max_step_num=max_out_len, is_test=True)
+
+    def forward(self, src, src_length):
+        # encoding
+        encoder_output, encoder_final_states = self.encoder(src, src_length)
+        # dynamic decoding with greedy search
+        rs, _ = self.greedy_search_decoder(inits=encoder_final_states)
+        return rs.sample_ids
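A rough usage sketch for the new class follows; the constructor values are illustrative (they mirror the defaults from the deleted seq2seq.yaml) and the surrounding dygraph setup is assumed rather than shown in this diff:

# Hypothetical construction of the greedy-decoding infer model; unlike
# BaseInferModel it ignores beam_size and decodes with GreedyEmbeddingHelper.
model = BaseGreedyInferModel(
    src_vocab_size=10000,
    trg_vocab_size=10000,
    embed_dim=512,
    hidden_size=512,
    num_layers=2,
    dropout_prob=0.1,
    bos_id=0,
    eos_id=1,
    max_out_len=256)
# forward(src, src_length) encodes the source batch and returns the greedily
# decoded token ids (rs.sample_ids) rather than beam-search candidates.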
seq2seq/train.py
@@ -30,65 +30,7 @@ from args import parse_args
 from seq2seq_base import BaseModel, CrossEntropyCriterion
 from seq2seq_attn import AttentionModel
 from reader import create_data_loader
-
-
-class TrainCallback(ProgBarLogger):
-    def __init__(self, args, ppl, verbose=2):
-        super(TrainCallback, self).__init__(1, verbose)
-        # control metric
-        self.ppl = ppl
-        self.batch_size = args.batch_size
-
-    def on_train_begin(self, logs=None):
-        super(TrainCallback, self).on_train_begin(logs)
-        self.train_metrics += ["ppl"]  # remove loss to not print it
-        self.ppl.reset()
-
-    def on_train_batch_end(self, step, logs=None):
-        batch_loss = logs["loss"][0]
-        self.ppl.total_loss += batch_loss * self.batch_size
-        logs["ppl"] = np.exp(self.ppl.total_loss / self.ppl.word_count)
-        if step > 0 and step % self.ppl.reset_freq == 0:
-            self.ppl.reset()
-        super(TrainCallback, self).on_train_batch_end(step, logs)
-
-    def on_eval_begin(self, logs=None):
-        super(TrainCallback, self).on_eval_begin(logs)
-        self.eval_metrics = ["ppl"]
-        self.ppl.reset()
-
-    def on_eval_batch_end(self, step, logs=None):
-        batch_loss = logs["loss"][0]
-        self.ppl.total_loss += batch_loss * self.batch_size
-        logs["ppl"] = np.exp(self.ppl.total_loss / self.ppl.word_count)
-        super(TrainCallback, self).on_eval_batch_end(step, logs)
-
-
-class PPL(Metric):
-    def __init__(self, reset_freq=100, name=None):
-        super(PPL, self).__init__()
-        self._name = name or "ppl"
-        self.reset_freq = reset_freq
-        self.reset()
-
-    def add_metric_op(self, pred, label):
-        seq_length = label[0]
-        word_num = fluid.layers.reduce_sum(seq_length)
-        return word_num
-
-    def update(self, word_num):
-        self.word_count += word_num
-        return word_num
-
-    def reset(self):
-        self.total_loss = 0
-        self.word_count = 0
-
-    def accumulate(self):
-        return self.word_count
-
-    def name(self):
-        return self._name
+from utility import PPL, TrainCallback


 def do_train(args):
@@ -122,10 +64,13 @@ def do_train(args):
     model = model_maker(args.src_vocab_size,
                         args.tar_vocab_size,
                         args.hidden_size,
                         args.hidden_size,
                         args.num_layers,
                         args.dropout)
-    optimizer = fluid.optimizer.Adam(
-        learning_rate=args.learning_rate, parameter_list=model.parameters())
-    optimizer._grad_clip = fluid.clip.GradientClipByGlobalNorm(
+    grad_clip = fluid.clip.GradientClipByGlobalNorm(
         clip_norm=args.max_grad_norm)
+    optimizer = fluid.optimizer.Adam(
+        learning_rate=args.learning_rate,
+        parameter_list=model.parameters(),
+        grad_clip=grad_clip)

     ppl_metric = PPL()
     model.prepare(
         optimizer,
@@ -139,7 +84,7 @@ def do_train(args):
         eval_freq=1,
         save_freq=1,
         save_dir=args.model_path,
-        callbacks=[TrainCallback(args, ppl_metric)])
+        callbacks=[TrainCallback(ppl_metric, args.log_freq)])


 if __name__ == "__main__":
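The optimizer hunk switches from assigning the private _grad_clip attribute to passing grad_clip directly to Adam. A toy standalone illustration of the same pattern (the Linear layer and numbers stand in for the real model and args; this assumes a Paddle 1.7-era dygraph API):

import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = fluid.dygraph.Linear(4, 2)          # stand-in for the seq2seq model
    grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
    optimizer = fluid.optimizer.Adam(
        learning_rate=0.001,
        parameter_list=model.parameters(),
        grad_clip=grad_clip)                    # clipping configured up front
    x = fluid.dygraph.to_variable(np.random.rand(8, 4).astype("float32"))
    loss = fluid.layers.reduce_mean(model(x))
    loss.backward()
    optimizer.minimize(loss)                    # gradients clipped by global norm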
seq2seq/utility.py (new file, 0 → 100644; the entire file below was added)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle.fluid as fluid

from metrics import Metric
from callbacks import ProgBarLogger


class TrainCallback(ProgBarLogger):
    def __init__(self, ppl, log_freq, verbose=2):
        super(TrainCallback, self).__init__(log_freq, verbose)
        self.ppl = ppl

    def on_train_begin(self, logs=None):
        super(TrainCallback, self).on_train_begin(logs)
        self.train_metrics = ["ppl"]  # remove loss to not print it

    def on_epoch_begin(self, epoch=None, logs=None):
        super(TrainCallback, self).on_epoch_begin(epoch, logs)
        self.ppl.reset()

    def on_train_batch_end(self, step, logs=None):
        logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"])
        if step > 0 and step % self.ppl.reset_freq == 0:
            self.ppl.reset()
        super(TrainCallback, self).on_train_batch_end(step, logs)

    def on_eval_begin(self, logs=None):
        super(TrainCallback, self).on_eval_begin(logs)
        self.eval_metrics = ["ppl"]
        self.ppl.reset()

    def on_eval_batch_end(self, step, logs=None):
        logs["ppl"] = self.ppl.cal_acc_ppl(logs["loss"][0], logs["batch_size"])
        super(TrainCallback, self).on_eval_batch_end(step, logs)


class PPL(Metric):
    def __init__(self, reset_freq=100, name=None):
        super(PPL, self).__init__()
        self._name = name or "ppl"
        self.reset_freq = reset_freq
        self.reset()

    def add_metric_op(self, pred, label):
        seq_length = label[0]
        word_num = fluid.layers.reduce_sum(seq_length)
        return word_num

    def update(self, word_num):
        self.word_count += word_num[0]
        return word_num

    def reset(self):
        self.total_loss = 0
        self.word_count = 0

    def accumulate(self):
        return self.word_count

    def name(self):
        return self._name

    def cal_acc_ppl(self, batch_loss, batch_size):
        self.total_loss += batch_loss * batch_size
        ppl = np.exp(self.total_loss / self.word_count)
        return ppl
\ No newline at end of file
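For reference, cal_acc_ppl keeps a running exp(total_loss / word_count); a tiny numeric sketch of that bookkeeping (the loss, batch-size and word-count values are made up and only exercise the arithmetic):

import numpy as np

total_loss, word_count = 0.0, 0
for batch_loss, batch_size, words in [(100.0, 32, 800), (95.0, 32, 760)]:
    word_count += words                    # what PPL.update() accumulates
    total_loss += batch_loss * batch_size  # what cal_acc_ppl adds
    print("ppl:", np.exp(total_loss / word_count))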