diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py
index 0e9c18416f62c85e76dd060f1fad44073e5841fc..5fc98868aa6e36bc5d1c5c0ad7ab231cda0fd52d 100644
--- a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py
+++ b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py
@@ -469,7 +469,7 @@ def train_loop(exe,
     # For faster executor
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.use_experimental_executor = True
-    # exec_strategy.num_iteration_per_drop_scope = 5
+    exec_strategy.num_iteration_per_drop_scope = int(args.fetch_steps)
     build_strategy = fluid.BuildStrategy()
     # Since the token number differs among devices, customize gradient scale to
     # use token average cost among multi-devices. and the gradient scale is