diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py index 0e9c18416f62c85e76dd060f1fad44073e5841fc..5fc98868aa6e36bc5d1c5c0ad7ab231cda0fd52d 100644 --- a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py +++ b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py @@ -469,7 +469,7 @@ def train_loop(exe, # For faster executor exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = True - # exec_strategy.num_iteration_per_drop_scope = 5 + exec_strategy.num_iteration_per_drop_scope = int(args.fetch_steps) build_strategy = fluid.BuildStrategy() # Since the token number differs among devices, customize gradient scale to # use token average cost among multi-devices. and the gradient scale is