diff --git a/PaddleNLP/benchmark/bert/README.md b/PaddleNLP/benchmark/bert/README.md
index 0948c3e677fa4c2c64572b2d6e22a922a508556c..684b63af41452b222b41d391025f564a64d76a48 100644
--- a/PaddleNLP/benchmark/bert/README.md
+++ b/PaddleNLP/benchmark/bert/README.md
@@ -20,7 +20,7 @@ export DATA_DIR=${HOME}/bert_data/wikicorpus_en
 ### 运行模型训练脚本
 
 ```shell
-1. 如果是需要多单机多卡/多机多卡训练，则使用下面的命令进行训练
+1. 如果是需要单机多卡/多机多卡训练，则使用下面的命令进行训练
 unset CUDA_VISIBLE_DEVICES
 fleetrun --gpus 0,1,2,3 ./run_pretrain.py \
     --model_type bert \
diff --git a/PaddleNLP/benchmark/bert/run_pretrain_single.py b/PaddleNLP/benchmark/bert/run_pretrain_single.py
index 1bd2104907657e963503b72e609469e10c37c014..9641ffbd05a46f5fbfba4a9b55b27466ee120aaa 100644
--- a/PaddleNLP/benchmark/bert/run_pretrain_single.py
+++ b/PaddleNLP/benchmark/bert/run_pretrain_single.py
@@ -266,21 +266,34 @@ def do_train(args):
         for f_id in range(0, len(files)):
             train_data_loader, _ = create_pretraining_dataset(
                 files[f_id], args.max_predictions_per_seq, args, data_holders)
+            train_reader_cost = 0.0
+            train_run_cost = 0.0
+            total_samples = 0
+            reader_start = time.time()
             for step, batch in enumerate(train_data_loader):
+                train_reader_cost += time.time() - reader_start
                 global_step += 1
+                train_start = time.time()
                 loss_return = exe.run(main_program,\
                     feed=batch,
                     fetch_list=[loss])
+                train_run_cost += time.time() - train_start
+                total_samples += args.batch_size
                 # In the new 2.0 api, must call this function to change the learning_rate
                 lr_scheduler.step()
                 if global_step % args.logging_steps == 0:
-                    time_cost = time.time() - tic_train
                     print(
-                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips: %.2f sequences/s"
-                        % (global_step, epoch, step, loss_return[0],
-                           args.logging_steps / time_cost,
-                           args.logging_steps * args.batch_size / time_cost))
-                    tic_train = time.time()
+                        "global step: %d, epoch: %d, batch: %d, loss: %f, "
+                        "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
+                        %
+                        (global_step, epoch, step, loss_return[0],
+                         train_reader_cost / args.logging_steps,
+                         (train_reader_cost + train_run_cost) /
+                         args.logging_steps, total_samples / args.logging_steps,
+                         total_samples / (train_reader_cost + train_run_cost)))
+                    train_reader_cost = 0.0
+                    train_run_cost = 0.0
+                    total_samples = 0
                 if global_step % args.save_steps == 0:
                     output_dir = os.path.join(args.output_dir,
                                               "model_%d" % global_step)
@@ -290,8 +303,10 @@
                     paddle.fluid.io.save_params(exe, output_dir)
                     tokenizer.save_pretrained(output_dir)
                 if global_step >= args.max_steps:
+                    reader_start = time.time()
                     del train_data_loader
                     return
+                reader_start = time.time()
             del train_data_loader
         epoch += 1
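
Below is a minimal, standalone sketch of the per-interval timing scheme that the patch adds to run_pretrain_single.py. The dummy_loader and train_step helpers and the logging_steps/batch_size values are hypothetical stand-ins for the real DataLoader and the exe.run call; only the accumulator bookkeeping and the averaged-cost/ips arithmetic mirror the patch.

import time

logging_steps = 10      # stand-in for args.logging_steps
batch_size = 32         # stand-in for args.batch_size


def dummy_loader(num_batches):
    # Stand-in for the pretraining DataLoader: yields fake batches after a short delay.
    for _ in range(num_batches):
        time.sleep(0.001)        # simulated read / preprocessing time
        yield object()


def train_step(batch):
    # Stand-in for exe.run(main_program, feed=batch, fetch_list=[loss]).
    time.sleep(0.005)            # simulated forward/backward time
    return 0.0                   # fake loss


train_reader_cost = 0.0          # time spent waiting on the data loader
train_run_cost = 0.0             # time spent inside the training step
total_samples = 0
global_step = 0

reader_start = time.time()
for batch in dummy_loader(50):
    train_reader_cost += time.time() - reader_start
    global_step += 1

    train_start = time.time()
    loss = train_step(batch)
    train_run_cost += time.time() - train_start
    total_samples += batch_size

    if global_step % logging_steps == 0:
        batch_cost = train_reader_cost + train_run_cost
        print("global step: %d, loss: %f, avg_reader_cost: %.5f sec, "
              "avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec"
              % (global_step, loss,
                 train_reader_cost / logging_steps,
                 batch_cost / logging_steps,
                 total_samples / logging_steps,
                 total_samples / batch_cost))
        # Reset the accumulators so every logging interval is averaged independently.
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0

    # Restart the reader timer so the next iteration only measures loader wait time.
    reader_start = time.time()

Resetting reader_start at the bottom of the loop (and right before the early return) keeps the checkpoint-saving and logging time out of the measured reader cost, which is the same design choice the patch makes.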