diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index 26ce487a925636d68a75c031416486456086c1b6..2d0e5accca7747ee30c861a9f711462b9c92fa35 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -171,7 +171,7 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece): ]) # This is used here to set dropout to the test mode. - infer_program = fluid.default_main_program().inference_optimize() + infer_program = fluid.default_main_program().clone(for_test=True) for batch_id, data in enumerate(test_data.batch_generator()): data_input = prepare_batch_input( diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index d24cff3ec038b43266c23e8968ea87033981ff0e..3d37268e61fa4840ac0eba40656927beb12e13af 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -428,7 +428,7 @@ def train_loop(exe, train_prog, startup_prog, dev_count, sum_cost, avg_cost, # Since the token number differs among devices, customize gradient scale to # use token average cost among multi-devices. and the gradient scale is # `1 / token_number` for average cost. - build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + # build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized train_exe = fluid.ParallelExecutor( use_cuda=TrainTaskConfig.use_gpu, loss_name=avg_cost.name,