From 08c08300e959103349f2782a496b2b17d40e3c0d Mon Sep 17 00:00:00 2001
From: Guo Sheng <whucsgs@163.com>
Date: Fri, 25 Dec 2020 11:38:09 +0800
Subject: [PATCH] Fix glue set_seed. (#5130) (#5144)

---
 PaddleNLP/benchmark/bert/run_glue.py          | 24 +++++++++++--------
 PaddleNLP/examples/glue/run_glue.py           | 10 +++++---
 .../examples/language_model/bert/run_glue.py  | 10 +++++---
 .../language_model/electra/run_glue.py        | 10 +++++---
 PaddleNLP/examples/slim/run_glue_ofa.py       | 14 +++++++----
 PaddleNLP/legacy/benchmark/bert/run_glue.py   | 24 +++++++++++--------
 6 files changed, 58 insertions(+), 34 deletions(-)

diff --git a/PaddleNLP/benchmark/bert/run_glue.py b/PaddleNLP/benchmark/bert/run_glue.py
index 21388921..3a8db53f 100644
--- a/PaddleNLP/benchmark/bert/run_glue.py
+++ b/PaddleNLP/benchmark/bert/run_glue.py
@@ -174,9 +174,13 @@ def reset_program_state_dict(model, state_dict, pretrained_state_dict):
 
 
 def set_seed(args):
-    random.seed(args.seed + paddle.distributed.get_rank())
-    np.random.seed(args.seed + paddle.distributed.get_rank())
-    paddle.seed(args.seed + paddle.distributed.get_rank())
+    # Use the same data seed (for data shuffling) on all procs to guarantee
+    # data consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Different op seeds (for dropout) per proc might be better, i.e.:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
 
 
 def evaluate(exe, metric, loss, correct, dev_program, data_loader):
@@ -276,12 +280,12 @@ def do_train(args):
     place = paddle.set_device(args.select_device)
     set_seed(args)
 
-    # Create the main_program for the training and dev_program for the validation 
+    # Create the main_program for the training and dev_program for the validation
     main_program = paddle.static.default_main_program()
     startup_program = paddle.static.default_startup_program()
     dev_program = paddle.static.Program()
 
-    # Get the configuration of tokenizer and model  
+    # Get the configuration of tokenizer and model
     args.task_name = args.task_name.lower()
     args.model_type = args.model_type.lower()
     model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
@@ -371,8 +375,8 @@ def do_train(args):
         loss = loss_fct(logits, labels)
         dev_program = main_program.clone(for_test=True)
 
-    # Create the training-backward program, this pass will not be 
-    # executed in the validation    
+    # Create the training-backward program, this pass will not be
+    # executed in the validation
     with paddle.static.program_guard(main_program, startup_program):
         lr_scheduler = paddle.optimizer.lr.LambdaDecay(
             args.learning_rate,
@@ -400,9 +404,9 @@ def do_train(args):
         metric = metric_class()
         correct = metric.compute(logits, labels)
 
-    # Initialize the fine-tuning parameter, we will load the parameters in 
+    # Initialize the fine-tuning parameter, we will load the parameters in
     # pre-training model. And initialize the parameter which not in pre-training model
-    # by the normal distribution. 
+    # by the normal distribution.
     exe = paddle.static.Executor(place)
     exe.run(startup_program)
     state_dict = model.state_dict()
@@ -424,7 +428,7 @@ def do_train(args):
                 tic_train = time.time()
             lr_scheduler.step()
             if global_step % args.save_steps == 0:
-                # Validation pass, record the loss and metric 
+                # Validation pass, record the loss and metric
                 if args.task_name == "mnli":
                     evaluate(exe, metric, loss, correct, dev_program,
                              dev_data_loader_matched)
diff --git a/PaddleNLP/examples/glue/run_glue.py b/PaddleNLP/examples/glue/run_glue.py
index d85d987b..93de366f 100644
--- a/PaddleNLP/examples/glue/run_glue.py
+++ b/PaddleNLP/examples/glue/run_glue.py
@@ -162,9 +162,13 @@ def parse_args():
 
 
 def set_seed(args):
-    random.seed(args.seed + paddle.distributed.get_rank())
-    np.random.seed(args.seed + paddle.distributed.get_rank())
-    paddle.seed(args.seed + paddle.distributed.get_rank())
+    # Use the same data seed (for data shuffling) on all procs to guarantee
+    # data consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Different op seeds (for dropout) per proc might be better, i.e.:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
 
 
 def evaluate(model, loss_fct, metric, data_loader):
diff --git a/PaddleNLP/examples/language_model/bert/run_glue.py b/PaddleNLP/examples/language_model/bert/run_glue.py
index d9f25a7c..6917c223 100644
--- a/PaddleNLP/examples/language_model/bert/run_glue.py
+++ b/PaddleNLP/examples/language_model/bert/run_glue.py
@@ -157,9 +157,13 @@ def parse_args():
 
 
 def set_seed(args):
-    random.seed(args.seed + paddle.distributed.get_rank())
-    np.random.seed(args.seed + paddle.distributed.get_rank())
-    paddle.seed(args.seed + paddle.distributed.get_rank())
+    # Use the same data seed (for data shuffling) on all procs to guarantee
+    # data consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Different op seeds (for dropout) per proc might be better, i.e.:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
 
 
 def evaluate(model, loss_fct, metric, data_loader):
diff --git a/PaddleNLP/examples/language_model/electra/run_glue.py b/PaddleNLP/examples/language_model/electra/run_glue.py
index 9ad07ae8..96d4deda 100644
--- a/PaddleNLP/examples/language_model/electra/run_glue.py
+++ b/PaddleNLP/examples/language_model/electra/run_glue.py
@@ -51,9 +51,13 @@ MODEL_CLASSES = {
 
 
 def set_seed(args):
-    random.seed(args.seed + paddle.distributed.get_rank())
-    np.random.seed(args.seed + paddle.distributed.get_rank())
-    paddle.seed(args.seed + paddle.distributed.get_rank())
+    # Use the same data seed (for data shuffling) on all procs to guarantee
+    # data consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Different op seeds (for dropout) per proc might be better, i.e.:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
 
 
 def evaluate(model, loss_fct, metric, data_loader):
diff --git a/PaddleNLP/examples/slim/run_glue_ofa.py b/PaddleNLP/examples/slim/run_glue_ofa.py
index 70dba0db..ac51dff7 100644
--- a/PaddleNLP/examples/slim/run_glue_ofa.py
+++ b/PaddleNLP/examples/slim/run_glue_ofa.py
@@ -161,9 +161,13 @@ def parse_args():
 
 
 def set_seed(args):
-    random.seed(args.seed + paddle.distributed.get_rank())
-    np.random.seed(args.seed + paddle.distributed.get_rank())
-    paddle.seed(args.seed + paddle.distributed.get_rank())
+    # Use the same data seed (for data shuffling) on all procs to guarantee
+    # data consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Different op seeds (for dropout) per proc might be better, i.e.:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
 
 
 def evaluate(model, criterion, metric, data_loader, width_mult=1.0):
@@ -411,7 +415,7 @@ def do_train(args):
     # Step2: Convert origin model to supernet.
     sp_config = supernet(expand_ratio=args.width_mult_list)
     model = Convert(sp_config).convert(model)
-    # Use weights saved in the dictionary to initialize supernet. 
+    # Use weights saved in the dictionary to initialize supernet.
     utils.set_state_dict(model, origin_weights)
     del origin_weights
 
@@ -444,7 +448,7 @@ def do_train(args):
     if args.task_name == "mnli":
         dev_data_loader = (dev_data_loader_matched, dev_data_loader_mismatched)
 
-    # Step6: Calculate the importance of neurons and head, 
+    # Step6: Calculate the importance of neurons and head,
     # and then reorder them according to the importance.
     head_importance, neuron_importance = utils.compute_neuron_head_importance(
         args.task_name,
diff --git a/PaddleNLP/legacy/benchmark/bert/run_glue.py b/PaddleNLP/legacy/benchmark/bert/run_glue.py
index 32011ef8..49e87f6a 100644
--- a/PaddleNLP/legacy/benchmark/bert/run_glue.py
+++ b/PaddleNLP/legacy/benchmark/bert/run_glue.py
@@ -163,9 +163,13 @@ def reset_program_state_dict(model, state_dict, pretrained_state_dict):
 
 
 def set_seed(args):
-    random.seed(args.seed + paddle.distributed.get_rank())
-    np.random.seed(args.seed + paddle.distributed.get_rank())
-    paddle.seed(args.seed + paddle.distributed.get_rank())
+    # Use the same data seed (for data shuffling) on all procs to guarantee
+    # data consistency after sharding.
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    # Different op seeds (for dropout) per proc might be better, i.e.:
+    # `paddle.seed(args.seed + paddle.distributed.get_rank())`
+    paddle.seed(args.seed)
 
 
 def evaluate(exe, metric, loss, correct, dev_program, data_loader):
@@ -256,12 +260,12 @@ def do_train(args):
     place = paddle.CUDAPlace(0)
     set_seed(args)
 
-    # Create the main_program for the training and dev_program for the validation 
+    # Create the main_program for the training and dev_program for the validation
     main_program = paddle.static.default_main_program()
     startup_program = paddle.static.default_startup_program()
     dev_program = paddle.static.Program()
 
-    # Get the configuration of tokenizer and model  
+    # Get the configuration of tokenizer and model
     args.task_name = args.task_name.lower()
     args.model_type = args.model_type.lower()
     model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
@@ -325,8 +329,8 @@ def do_train(args):
         loss = loss_fct(logits, labels)
         dev_program = main_program.clone(for_test=True)
 
-    # Create the training-backward program, this pass will not be 
-    # executed in the validation    
+    # Create the training-backward program, this pass will not be
+    # executed in the validation
     with paddle.static.program_guard(main_program, startup_program):
         lr_scheduler = paddle.optimizer.lr.LambdaDecay(
             args.learning_rate,
@@ -354,9 +358,9 @@ def do_train(args):
         metric = metric_class()
         correct = metric.compute(logits, labels)
 
-    # Initialize the fine-tuning parameter, we will load the parameters in 
+    # Initialize the fine-tuning parameter, we will load the parameters in
     # pre-training model. And initialize the parameter which not in pre-training model
-    # by the normal distribution. 
+    # by the normal distribution.
     exe = paddle.static.Executor(place)
     exe.run(startup_program)
     state_dict = model.state_dict()
@@ -378,7 +382,7 @@ def do_train(args):
                 tic_train = time.time()
             lr_scheduler.step()
             if global_step % args.save_steps == 0:
-                # Validation pass, record the loss and metric 
+                # Validation pass, record the loss and metric
                 evaluate(exe, metric, loss, correct, dev_program,
                          dev_data_loader)
                 output_dir = os.path.join(args.output_dir,
-- 
GitLab