diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 24e0b196d4974ae9f8e3fe0612691aa53b48c2f3..c751e229cbbe2b900ead900297ff9956946b9e75 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -25,7 +25,6 @@ class AMPOptimizer(MetaOptimizerBase):
             "LarsOptimizer",
             "LambOptimizer",
             "RecomputeOptimizer",
-            "GradientMergeOptimizer",
             "GraphExecutionOptimizer",
         ]
         self.meta_optimizers_black_list = ["DGCOptimizer"]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 6315fbf5a0d633c576b033594f6ebc1e06228e31..380fbc2e09ebffe89cfaabfd8d753dc47e8d85ff 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -21,6 +21,7 @@ class GradientMergeOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.wrapped_opt = None
         self.meta_optimizers_white_list = [
+            "AMPOptimizer",
             "LarsOptimizer",
             "LambOptimizer",
             "GraphExecutionOptimizer",
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index 37996b6228efe4c2e211b027551bafb4c029d98a..2215d11aa06c2f81fdd1591a75a2e4acf23f2998 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -159,9 +159,6 @@ class OptimizerWithMixedPrecision(object):
         params_grads = self._optimizer.backward(
             self._scaled_loss, startup_program, parameter_list, no_grad_set,
             callbacks)
-        # Change the op_role_var attr for some ops, so that gradients
-        # transferred across GPUs can be FP16.
-        update_role_var_grad(train_program, params_grads)
         return params_grads
 
     def apply_gradients(self, params_grads):
@@ -176,6 +173,10 @@ class OptimizerWithMixedPrecision(object):
             A list of optimize operators.
         """
 
+        # Change the op_role_var attr for some ops, so that gradients
+        # transferred across GPUs can be FP16.
+        update_role_var_grad(self._train_program, params_grads)
+
         grads = [g for _, g in params_grads]
         if not self._is_distributed:
             with self._train_program._optimized_guard(grads):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
index a40bc9a9fba6efd60af34846426d33740212d286..2d03b267fe9e3c969a82976fd3309374c53efa46 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -46,6 +46,19 @@ class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer):
         self.assertIn('@GradientMerge', ''.join(vars))
         self.assertIn('subprog', ''.join(vars))
 
+    def test_gm_amp_optimizer(self):
+        train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
+        )
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'gradient_merge')
+        self.set_strategy(strategy, 'amp')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+        print(train_prog)
+
+        vars = [x.name for x in train_prog.list_vars()]
+        self.assertIn('@GradientMerge', ''.join(vars))
+        self.assertIn('cast', ''.join(vars))
+
 
 if __name__ == "__main__":
     unittest.main()
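
Note (not part of the patch): the change above allows the AMP and gradient-merge meta optimizers to be combined through fleet.DistributedStrategy, which is what the new unit test exercises. Below is a minimal usage sketch; the toy network, hyperparameters, and k_steps value are illustrative, and the script assumes it is launched via `python -m paddle.distributed.launch` so the collective environment is set up for `fleet.init(is_collective=True)`.

import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

# Illustrative toy static-graph network; any static model works here.
x = fluid.data(name="x", shape=[None, 32], dtype="float32")
y = fluid.data(name="y", shape=[None, 1], dtype="int64")
fc = fluid.layers.fc(input=x, size=2)
cost = fluid.layers.softmax_with_cross_entropy(logits=fc, label=y)
avg_cost = fluid.layers.mean(cost)

strategy = fleet.DistributedStrategy()
strategy.amp = True             # AMPOptimizer: FP16 training with loss scaling
strategy.gradient_merge = True  # GradientMergeOptimizer: accumulate grads over k steps
strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)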