From d111569654555efdc5cf8e11d709d70aedd95c37 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Sun, 20 Dec 2020 03:22:42 +0800 Subject: [PATCH] Fix single data file in BERT pre-training. (#5127) --- PaddleNLP/examples/language_model/bert/README.md | 2 +- PaddleNLP/examples/language_model/bert/run_pretrain.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/PaddleNLP/examples/language_model/bert/README.md b/PaddleNLP/examples/language_model/bert/README.md index 09888834..660f5a47 100644 --- a/PaddleNLP/examples/language_model/bert/README.md +++ b/PaddleNLP/examples/language_model/bert/README.md @@ -131,7 +131,7 @@ python -u ./run_glue.py \ | Task | Metric | Result | |:-----:|:----------------------------:|:-----------------:| | SST-2 | Accuracy | 0.92660 | -| QNLI | Accuracy | 0.91781 | +| QNLI | Accuracy | 0.91707 | | CoLA | Mattehew's corr | 0.59557 | | MRPC | F1/Accuracy | 0.91667/0.88235 | | STS-B | Person/Spearman corr | 0.88847/0.88350 | diff --git a/PaddleNLP/examples/language_model/bert/run_pretrain.py b/PaddleNLP/examples/language_model/bert/run_pretrain.py index e70ed0b7..0cbc8e00 100644 --- a/PaddleNLP/examples/language_model/bert/run_pretrain.py +++ b/PaddleNLP/examples/language_model/bert/run_pretrain.py @@ -340,13 +340,13 @@ def do_train(args): train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, shared_file_list, args, - worker_init) + worker_init) # TODO(guosheng): better way to process single file - if f_start_id + 1 == len(files): single_file = True - + single_file = True if f_start_id + 1 == len(files) else False + for f_id in range(f_start_id, len(files)): - if not single_file: + if not single_file and f_id == f_start_id: continue if paddle.distributed.get_world_size() > num_files: data_file = files[(f_id * paddle.distributed.get_world_size() + -- GitLab