From ab04997846bdc7497772987604e30889ed60cc88 Mon Sep 17 00:00:00 2001
From: WangXi
Date: Tue, 5 Jan 2021 16:47:27 +0800
Subject: [PATCH] [fleet] combine amp and gradient merge, test=develop (#30086)

---
 .../fleet/meta_optimizers/amp_optimizer.py          |  1 -
 .../meta_optimizers/gradient_merge_optimizer.py     |  1 +
 .../fluid/contrib/mixed_precision/decorator.py      |  7 ++++---
 .../test_fleet_gradient_merge_meta_optimizer.py     | 13 +++++++++++++
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 24e0b196d49..c751e229cbb 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -25,7 +25,6 @@ class AMPOptimizer(MetaOptimizerBase):
             "LarsOptimizer",
             "LambOptimizer",
             "RecomputeOptimizer",
-            "GradientMergeOptimizer",
             "GraphExecutionOptimizer",
         ]
         self.meta_optimizers_black_list = ["DGCOptimizer"]
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 6315fbf5a0d..380fbc2e09e 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -21,6 +21,7 @@ class GradientMergeOptimizer(MetaOptimizerBase):
         self.inner_opt = optimizer
         self.wrapped_opt = None
         self.meta_optimizers_white_list = [
+            "AMPOptimizer",
             "LarsOptimizer",
             "LambOptimizer",
             "GraphExecutionOptimizer",
diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py
index 37996b6228e..2215d11aa06 100644
--- a/python/paddle/fluid/contrib/mixed_precision/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py
@@ -159,9 +159,6 @@ class OptimizerWithMixedPrecision(object):
         params_grads = self._optimizer.backward(
             self._scaled_loss, startup_program, parameter_list, no_grad_set,
             callbacks)
-        # Change the op_role_var attr for some ops, so that gradients
-        # transferred across GPUs can be FP16.
-        update_role_var_grad(train_program, params_grads)
         return params_grads

     def apply_gradients(self, params_grads):
@@ -176,6 +173,10 @@
             A list of optimize operators.
         """

+        # Change the op_role_var attr for some ops, so that gradients
+        # transferred across GPUs can be FP16.
+        update_role_var_grad(self._train_program, params_grads)
+
         grads = [g for _, g in params_grads]
         if not self._is_distributed:
             with self._train_program._optimized_guard(grads):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
index a40bc9a9fba..2d03b267fe9 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py
@@ -46,6 +46,19 @@ class TestFleetGradientMergeMetaOptimizer(TestFleetMetaOptimizer):
         self.assertIn('@GradientMerge', ''.join(vars))
         self.assertIn('subprog', ''.join(vars))

+    def test_gm_amp_optimizer(self):
+        train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
+        )
+        avg_cost, strategy = self.net(train_prog, startup_prog)
+        self.set_strategy(strategy, 'gradient_merge')
+        self.set_strategy(strategy, 'amp')
+        self.optimizer(avg_cost, strategy, train_prog, startup_prog)
+        print(train_prog)
+
+        vars = [x.name for x in train_prog.list_vars()]
+        self.assertIn('@GradientMerge', ''.join(vars))
+        self.assertIn('cast', ''.join(vars))
+

 if __name__ == "__main__":
     unittest.main()
--
GitLab
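Usage sketch: with this patch, the AMP and gradient-merge meta optimizers can be enabled together through a single fleet.DistributedStrategy. The example below is a minimal sketch assuming the Paddle 2.0 static-graph fleet API; the toy network, variable names, and k_steps value are illustrative only and are not part of the patch.

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    fleet.init(is_collective=True)

    # Toy single-layer regression network (illustrative only).
    x = paddle.static.data(name="x", shape=[-1, 32], dtype="float32")
    y = paddle.static.data(name="y", shape=[-1, 1], dtype="float32")
    pred = paddle.static.nn.fc(x, size=1)
    loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

    strategy = fleet.DistributedStrategy()
    strategy.amp = True                # applies AMPOptimizer (mixed precision)
    strategy.gradient_merge = True     # applies GradientMergeOptimizer
    strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}

    opt = paddle.optimizer.SGD(learning_rate=0.01)
    opt = fleet.distributed_optimizer(opt, strategy=strategy)
    opt.minimize(loss)

Before this change, AMPOptimizer and GradientMergeOptimizer excluded each other via their white lists, so enabling both flags would not compose; the patch adds each to the other's white list and defers update_role_var_grad to apply_gradients so it operates on the merged gradients.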