From a48aacb4a81e5e24326450f1373e678bca408ce4 Mon Sep 17 00:00:00 2001 From: smallv0221 <33639025+smallv0221@users.noreply.github.com> Date: Tue, 22 Dec 2020 19:19:15 +0800 Subject: [PATCH] fix warmup step bug and qa padding (#5128) --- .../DuReader-robust/run_du.py | 2 +- PaddleNLP/paddlenlp/datasets/squad.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py index feaed689..6b2614db 100644 --- a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py +++ b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py @@ -181,7 +181,7 @@ def do_train(args): args.learning_rate, lambda current_step, warmup_proportion=args.warmup_proportion, num_training_steps=args.max_steps if args.max_steps > 0 else - (len(train_ds.examples)//args.batch_size*args.num_train_epochs): float( + (len(train_data_loader)*args.num_train_epochs): float( current_step) / float(max(1, warmup_proportion*num_training_steps)) if current_step < warmup_proportion*num_training_steps else max( 0.0, diff --git a/PaddleNLP/paddlenlp/datasets/squad.py b/PaddleNLP/paddlenlp/datasets/squad.py index f5355aa4..467d5293 100644 --- a/PaddleNLP/paddlenlp/datasets/squad.py +++ b/PaddleNLP/paddlenlp/datasets/squad.py @@ -243,7 +243,14 @@ class SQuAD(Dataset): segment_ids.append(1) input_ids = tokenizer.convert_tokens_to_ids(tokens) - + input_ids = input_ids + [ + tokenizer.vocab[tokenizer.pad_token] + for _ in range(self.max_seq_length - len(input_ids)) + ] + segment_ids = segment_ids + [ + tokenizer.vocab[tokenizer.pad_token] + for _ in range(self.max_seq_length - len(segment_ids)) + ] input_mask = [1] * len(input_ids) start_position = None -- GitLab