From 5903151f3bdd6c01208915992fd74aaf88041638 Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Sun, 27 Sep 2020 07:07:50 +0000
Subject: [PATCH] move apply in minimize

---
 .../paddle/fluid/dygraph/varbase_patch_methods.py  |  2 +-
 python/paddle/fluid/optimizer.py                   | 14 ++++++++------
 python/paddle/optimizer/adamw.py                   |  6 +-----
 python/paddle/optimizer/optimizer.py               | 14 ++++++++------
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 438e307b121..6ac13923a2a 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -19,7 +19,7 @@ import paddle
 from .. import framework
 from .. import core
 from ..framework import Variable, Parameter, ParamBase
-from .base import switch_to_static_graph, to_variable
+from .base import switch_to_static_graph
 from .math_op_patch import monkey_patch_math_varbase
 from .parallel import scale_loss
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index d4c99641b2c..761f6409fed 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -22,7 +22,7 @@ from collections import defaultdict
 import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
-from paddle.fluid.dygraph.parallel import scale_loss, apply_collective_grads
+from paddle.fluid.dygraph.parallel import apply_collective_grads
 
 from . import framework
 from . import layers
@@ -772,8 +772,14 @@ class Optimizer(object):
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
+            parameter_list = parameter_list if parameter_list \
+                else self._parameter_list
+
+            if paddle.distributed.get_world_size() > 1:
+                apply_collective_grads(parameter_list)
+
             params_grads = []
-            for param in self._parameter_list:
+            for param in parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -941,10 +947,6 @@
         parameter_list = parameter_list if parameter_list \
             else self._parameter_list
 
-        if paddle.distributed.get_world_size() > 1:
-            loss = scale_loss(loss)
-            apply_collective_grads(parameter_list)
-
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 983b83884ae..0b04f03eb14 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -16,7 +16,7 @@
 from .optimizer import Optimizer
 from .adam import Adam
 from ..fluid import framework
 import paddle
-from paddle.fluid.dygraph.parallel import scale_loss, apply_collective_grads
+from paddle.fluid.dygraph.parallel import apply_collective_grads
 
 __all__ = ['AdamW']
@@ -189,10 +189,6 @@
         parameters = parameters if parameters \
             else self._parameter_list
 
-        if paddle.distributed.get_world_size() > 1:
-            loss = scale_loss(loss)
-            apply_collective_grads(parameter_list)
-
         params_grads = self.backward(
             loss=loss,
             startup_program=startup_program,
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 77f753e46b5..15519cdd300 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -22,7 +22,7 @@ from collections import defaultdict
 import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
-from paddle.fluid.dygraph.parallel import scale_loss, apply_collective_grads
+from paddle.fluid.dygraph.parallel import apply_collective_grads
 
 from ..fluid import framework
 from ..fluid import layers
@@ -676,8 +676,14 @@ class Optimizer(object):
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
+            parameter_list = parameters if parameters \
+                else self._parameter_list
+
+            if paddle.distributed.get_world_size() > 1:
+                apply_collective_grads(parameter_list)
+
             params_grads = []
-            for param in self._parameter_list:
+            for param in parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -873,10 +879,6 @@
         parameter_list = parameters if parameters \
             else self._parameter_list
 
-        if paddle.distributed.get_world_size() > 1:
-            loss = scale_loss(loss)
-            apply_collective_grads(parameter_list)
-
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
-- 
GitLab
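
Note (appended after the patch trailer, not part of the commit): the diff moves apply_collective_grads from minimize() into the dygraph branch of Optimizer.backward(), which now resolves the parameter list and, when paddle.distributed.get_world_size() > 1, applies collective gradients before collecting params_grads; minimize() no longer calls scale_loss() or apply_collective_grads() on this path. Below is a minimal single-process dygraph sketch of how that code path is exercised. The Linear model, Adam optimizer, and random input are illustrative assumptions, not code from this patch.

# Illustrative sketch only; model, data, and optimizer choice are assumptions.
import paddle

paddle.disable_static()  # dygraph mode, the branch touched by this patch

linear = paddle.nn.Linear(10, 1)
opt = paddle.optimizer.Adam(learning_rate=0.01,
                            parameters=linear.parameters())

x = paddle.rand([4, 10])
loss = paddle.mean(linear(x))
loss.backward()
# minimize() delegates to backward(); after this patch, backward() picks the
# parameter list and, only when world_size > 1, applies collective gradients
# before building params_grads.
opt.minimize(loss)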