From 08c08300e959103349f2782a496b2b17d40e3c0d Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Fri, 25 Dec 2020 11:38:09 +0800 Subject: [PATCH] Fix glue set_seed. (#5130) (#5144) --- PaddleNLP/benchmark/bert/run_glue.py | 24 +++++++++++-------- PaddleNLP/examples/glue/run_glue.py | 10 +++++--- .../examples/language_model/bert/run_glue.py | 10 +++++--- .../language_model/electra/run_glue.py | 10 +++++--- PaddleNLP/examples/slim/run_glue_ofa.py | 14 +++++++---- PaddleNLP/legacy/benchmark/bert/run_glue.py | 24 +++++++++++-------- 6 files changed, 58 insertions(+), 34 deletions(-) diff --git a/PaddleNLP/benchmark/bert/run_glue.py b/PaddleNLP/benchmark/bert/run_glue.py index 21388921..3a8db53f 100644 --- a/PaddleNLP/benchmark/bert/run_glue.py +++ b/PaddleNLP/benchmark/bert/run_glue.py @@ -174,9 +174,13 @@ def reset_program_state_dict(model, state_dict, pretrained_state_dict): def set_seed(args): - random.seed(args.seed + paddle.distributed.get_rank()) - np.random.seed(args.seed + paddle.distributed.get_rank()) - paddle.seed(args.seed + paddle.distributed.get_rank()) + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) def evaluate(exe, metric, loss, correct, dev_program, data_loader): @@ -276,12 +280,12 @@ def do_train(args): place = paddle.set_device(args.select_device) set_seed(args) - # Create the main_program for the training and dev_program for the validation + # Create the main_program for the training and dev_program for the validation main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() dev_program = paddle.static.Program() - # Get the configuration of tokenizer and model + # Get the configuration of tokenizer and model args.task_name = args.task_name.lower() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] @@ -371,8 +375,8 @@ def do_train(args): loss = loss_fct(logits, labels) dev_program = main_program.clone(for_test=True) - # Create the training-backward program, this pass will not be - # executed in the validation + # Create the training-backward program, this pass will not be + # executed in the validation with paddle.static.program_guard(main_program, startup_program): lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, @@ -400,9 +404,9 @@ def do_train(args): metric = metric_class() correct = metric.compute(logits, labels) - # Initialize the fine-tuning parameter, we will load the parameters in + # Initialize the fine-tuning parameter, we will load the parameters in # pre-training model. And initialize the parameter which not in pre-training model - # by the normal distribution. + # by the normal distribution. exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() @@ -424,7 +428,7 @@ def do_train(args): tic_train = time.time() lr_scheduler.step() if global_step % args.save_steps == 0: - # Validation pass, record the loss and metric + # Validation pass, record the loss and metric if args.task_name == "mnli": evaluate(exe, metric, loss, correct, dev_program, dev_data_loader_matched) diff --git a/PaddleNLP/examples/glue/run_glue.py b/PaddleNLP/examples/glue/run_glue.py index d85d987b..93de366f 100644 --- a/PaddleNLP/examples/glue/run_glue.py +++ b/PaddleNLP/examples/glue/run_glue.py @@ -162,9 +162,13 @@ def parse_args(): def set_seed(args): - random.seed(args.seed + paddle.distributed.get_rank()) - np.random.seed(args.seed + paddle.distributed.get_rank()) - paddle.seed(args.seed + paddle.distributed.get_rank()) + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) def evaluate(model, loss_fct, metric, data_loader): diff --git a/PaddleNLP/examples/language_model/bert/run_glue.py b/PaddleNLP/examples/language_model/bert/run_glue.py index d9f25a7c..6917c223 100644 --- a/PaddleNLP/examples/language_model/bert/run_glue.py +++ b/PaddleNLP/examples/language_model/bert/run_glue.py @@ -157,9 +157,13 @@ def parse_args(): def set_seed(args): - random.seed(args.seed + paddle.distributed.get_rank()) - np.random.seed(args.seed + paddle.distributed.get_rank()) - paddle.seed(args.seed + paddle.distributed.get_rank()) + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) def evaluate(model, loss_fct, metric, data_loader): diff --git a/PaddleNLP/examples/language_model/electra/run_glue.py b/PaddleNLP/examples/language_model/electra/run_glue.py index 9ad07ae8..96d4deda 100644 --- a/PaddleNLP/examples/language_model/electra/run_glue.py +++ b/PaddleNLP/examples/language_model/electra/run_glue.py @@ -51,9 +51,13 @@ MODEL_CLASSES = { def set_seed(args): - random.seed(args.seed + paddle.distributed.get_rank()) - np.random.seed(args.seed + paddle.distributed.get_rank()) - paddle.seed(args.seed + paddle.distributed.get_rank()) + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) def evaluate(model, loss_fct, metric, data_loader): diff --git a/PaddleNLP/examples/slim/run_glue_ofa.py b/PaddleNLP/examples/slim/run_glue_ofa.py index 70dba0db..ac51dff7 100644 --- a/PaddleNLP/examples/slim/run_glue_ofa.py +++ b/PaddleNLP/examples/slim/run_glue_ofa.py @@ -161,9 +161,13 @@ def parse_args(): def set_seed(args): - random.seed(args.seed + paddle.distributed.get_rank()) - np.random.seed(args.seed + paddle.distributed.get_rank()) - paddle.seed(args.seed + paddle.distributed.get_rank()) + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) def evaluate(model, criterion, metric, data_loader, width_mult=1.0): @@ -411,7 +415,7 @@ def do_train(args): # Step2: Convert origin model to supernet. sp_config = supernet(expand_ratio=args.width_mult_list) model = Convert(sp_config).convert(model) - # Use weights saved in the dictionary to initialize supernet. + # Use weights saved in the dictionary to initialize supernet. utils.set_state_dict(model, origin_weights) del origin_weights @@ -444,7 +448,7 @@ def do_train(args): if args.task_name == "mnli": dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched) - # Step6: Calculate the importance of neurons and head, + # Step6: Calculate the importance of neurons and head, # and then reorder them according to the importance. head_importance, neuron_importance = utils.compute_neuron_head_importance( args.task_name, diff --git a/PaddleNLP/legacy/benchmark/bert/run_glue.py b/PaddleNLP/legacy/benchmark/bert/run_glue.py index 32011ef8..49e87f6a 100644 --- a/PaddleNLP/legacy/benchmark/bert/run_glue.py +++ b/PaddleNLP/legacy/benchmark/bert/run_glue.py @@ -163,9 +163,13 @@ def reset_program_state_dict(model, state_dict, pretrained_state_dict): def set_seed(args): - random.seed(args.seed + paddle.distributed.get_rank()) - np.random.seed(args.seed + paddle.distributed.get_rank()) - paddle.seed(args.seed + paddle.distributed.get_rank()) + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) def evaluate(exe, metric, loss, correct, dev_program, data_loader): @@ -256,12 +260,12 @@ def do_train(args): place = paddle.CUDAPlace(0) set_seed(args) - # Create the main_program for the training and dev_program for the validation + # Create the main_program for the training and dev_program for the validation main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() dev_program = paddle.static.Program() - # Get the configuration of tokenizer and model + # Get the configuration of tokenizer and model args.task_name = args.task_name.lower() args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] @@ -325,8 +329,8 @@ def do_train(args): loss = loss_fct(logits, labels) dev_program = main_program.clone(for_test=True) - # Create the training-backward program, this pass will not be - # executed in the validation + # Create the training-backward program, this pass will not be + # executed in the validation with paddle.static.program_guard(main_program, startup_program): lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, @@ -354,9 +358,9 @@ def do_train(args): metric = metric_class() correct = metric.compute(logits, labels) - # Initialize the fine-tuning parameter, we will load the parameters in + # Initialize the fine-tuning parameter, we will load the parameters in # pre-training model. And initialize the parameter which not in pre-training model - # by the normal distribution. + # by the normal distribution. exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() @@ -378,7 +382,7 @@ def do_train(args): tic_train = time.time() lr_scheduler.step() if global_step % args.save_steps == 0: - # Validation pass, record the loss and metric + # Validation pass, record the loss and metric evaluate(exe, metric, loss, correct, dev_program, dev_data_loader) output_dir = os.path.join(args.output_dir, -- GitLab