diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 438e307b121013d0bc65baaa1c3d53238a86ac4e..6ac13923a2a7f370512ff79b128d2c70809f8e94 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -19,7 +19,7 @@ import paddle
 from .. import framework
 from .. import core
 from ..framework import Variable, Parameter, ParamBase
-from .base import switch_to_static_graph, to_variable
+from .base import switch_to_static_graph
 from .math_op_patch import monkey_patch_math_varbase
 from .parallel import scale_loss
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index d4c99641b2c3502d7361904fe627ff6caba3b33b..761f6409fed761c3799c4c32fccdec768aa2d5d9 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -22,7 +22,7 @@ from collections import defaultdict
 import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
-from paddle.fluid.dygraph.parallel import scale_loss, apply_collective_grads
+from paddle.fluid.dygraph.parallel import apply_collective_grads
 from . import framework
 from . import layers
 
@@ -772,8 +772,14 @@ class Optimizer(object):
 
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
+            parameter_list = parameter_list if parameter_list \
+                else self._parameter_list
+
+            if paddle.distributed.get_world_size() > 1:
+                apply_collective_grads(parameter_list)
+
             params_grads = []
-            for param in self._parameter_list:
+            for param in parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -941,10 +947,6 @@ class Optimizer(object):
         parameter_list = parameter_list if parameter_list \
             else self._parameter_list
 
-        if paddle.distributed.get_world_size() > 1:
-            loss = scale_loss(loss)
-            apply_collective_grads(parameter_list)
-
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 983b83884aeb79fe6c60fc50bda97964457ad451..0b04f03eb14da313467b31afaf1b4a05ef7eca3d 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -16,7 +16,7 @@ from .optimizer import Optimizer
 from .adam import Adam
 from ..fluid import framework
 import paddle
-from paddle.fluid.dygraph.parallel import scale_loss, apply_collective_grads
+from paddle.fluid.dygraph.parallel import apply_collective_grads
 
 __all__ = ['AdamW']
 
@@ -189,10 +189,6 @@ class AdamW(Adam):
         parameters = parameters if parameters \
             else self._parameter_list
 
-        if paddle.distributed.get_world_size() > 1:
-            loss = scale_loss(loss)
-            apply_collective_grads(parameter_list)
-
         params_grads = self.backward(
             loss=loss,
             startup_program=startup_program,
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 77f753e46b5ab04a18575489e2728c20276c974d..15519cdd300e998305a9cc2e1435cc90cf23529b 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -22,7 +22,7 @@ from collections import defaultdict
 import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
-from paddle.fluid.dygraph.parallel import scale_loss, apply_collective_grads
+from paddle.fluid.dygraph.parallel import apply_collective_grads
 from ..fluid import framework
 from ..fluid import layers
 
@@ -676,8 +676,14 @@ class Optimizer(object):
 
         self._dtype = loss.dtype
         if framework.in_dygraph_mode():
+            parameter_list = parameters if parameters \
+                else self._parameter_list
+
+            if paddle.distributed.get_world_size() > 1:
+                apply_collective_grads(parameter_list)
+
             params_grads = []
-            for param in self._parameter_list:
+            for param in parameter_list:
                 if not param.trainable:
                     continue
                 if param._grad_ivar() is not None:
@@ -873,10 +879,6 @@ class Optimizer(object):
         parameter_list = parameters if parameters \
             else self._parameter_list
 
-        if paddle.distributed.get_world_size() > 1:
-            loss = scale_loss(loss)
-            apply_collective_grads(parameter_list)
-
         params_grads = self.backward(
             loss,
             startup_program=startup_program,
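
For reference, a minimal dygraph data-parallel training sketch against this change. The model, tensor shapes, and loop below are illustrative assumptions, not part of the patch; only the optimizer calls reflect the diff, where `apply_collective_grads` now runs inside `Optimizer.backward()` (invoked by `minimize()`) and `minimize()` no longer calls `scale_loss`.

```python
# Hypothetical usage sketch (toy model and shapes); assumes the script is
# started with `python -m paddle.distributed.launch train.py` so that
# paddle.distributed.get_world_size() > 1 in each worker.
import paddle
import paddle.distributed as dist


def train():
    dist.init_parallel_env()

    layer = paddle.nn.Linear(10, 1)           # illustrative toy model
    dp_layer = paddle.DataParallel(layer)
    opt = paddle.optimizer.AdamW(
        learning_rate=1e-3, parameters=dp_layer.parameters())

    for _ in range(3):
        x = paddle.randn([4, 10], dtype='float32')
        loss = paddle.mean(dp_layer(x))
        loss.backward()
        # After this patch, Optimizer.backward() (called from minimize) runs
        # apply_collective_grads(parameter_list) when world_size > 1, so the
        # gradient allreduce no longer depends on minimize() doing it.
        opt.minimize(loss)
        opt.clear_grad()


if __name__ == '__main__':
    train()
```

`AdamW.minimize` follows the same path, since the duplicated `scale_loss`/`apply_collective_grads` block is removed from it as well.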