Unverified · Commit 36bc5511 authored by zhaoyingli, committed by GitHub

[AutoParallel] keep lr_scheduler same between executor and engine (#55516)

Parent 77032f0e

@@ -171,14 +171,14 @@ class LRSchedulerAuto(LRScheduler):
         if self.by_step and self.train_step % self.acc_step == 0:
             if (
-                self.model._optimizer
-                and hasattr(self.model._optimizer, '_learning_rate')
+                self.model.optimizer
+                and hasattr(self.model.optimizer, '_learning_rate')
                 and isinstance(
-                    self.model._optimizer._learning_rate,
+                    self.model.optimizer._learning_rate,
                     paddle.optimizer.lr.LRScheduler,
                 )
             ):
-                self.model._optimizer._learning_rate.step()
+                self.model.optimizer._learning_rate.step()


 class History(Callback):
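
The by-step branch above boils down to: count training steps and advance the optimizer's LRScheduler once per accumulated step, now reading the optimizer through the model's public `optimizer` attribute instead of the private `_optimizer`. A minimal, self-contained sketch of that pattern, assuming only the public paddle.optimizer.lr API; `TinyLRStepper` and its attributes are illustrative names, not part of Paddle:

# Sketch of the by-step branch above; not the Paddle source.
import paddle

class TinyLRStepper:
    def __init__(self, optimizer, acc_step=2):
        self.optimizer = optimizer   # public attribute, as the hunk now uses
        self.acc_step = acc_step     # micro-batches per accumulated step
        self.train_step = 0

    def on_train_batch_end(self):
        self.train_step += 1
        lr = getattr(self.optimizer, '_learning_rate', None)
        # Step the scheduler once per accumulated (global) batch, not per micro-batch.
        if self.train_step % self.acc_step == 0 and isinstance(
            lr, paddle.optimizer.lr.LRScheduler
        ):
            lr.step()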

@@ -970,7 +970,9 @@ class Engine:
             save_dir=save_dir,
             verbose=verbose,
             metrics=self._metrics_name(),
-            acc_step=self._acc_steps,
+            acc_step=1
+            if self._strategy.pipeline.enable
+            else self._acc_steps,  # lr update once every local batch
         )
         cbks.on_begin('train')
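
The new `acc_step` argument above controls how often the LR callback fires: with pipeline parallelism enabled, the executor consumes the whole accumulated batch in one run, so the scheduler should step once per local batch; otherwise it steps once every `_acc_steps` micro-batches. A toy illustration of that conditional; the helper name and its parameters are made up for this note:

# Hypothetical helper mirroring the `acc_step` conditional above.
def lr_acc_step(pipeline_enabled: bool, acc_steps: int) -> int:
    # Pipeline mode: the executor runs the whole accumulated batch at once,
    # so the LR scheduler is updated once every local batch.
    return 1 if pipeline_enabled else acc_steps

assert lr_acc_step(True, 4) == 1    # pipeline: step the LR every local batch
assert lr_acc_step(False, 4) == 4   # otherwise: step the LR every 4 micro-batches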

@@ -223,10 +223,15 @@ class Parallelizer:
     def _generate_optimizer(
         self, main_program, startup_program, optimizer, params_grads
     ):
-        # NOTE: `apply_gradients` will add an Accumulator for a parameter only once,
-        # but optimizer will be called repeatedly in re-launch, so optimizer need to be copied.
+        # NOTE:
+        # 1. `apply_gradients` adds an Accumulator for a parameter only once, but the
+        #    optimizer is called repeatedly on re-launch, so it needs to be copied.
+        # 2. The lr_scheduler must not be deepcopied: a deep copy would make the
+        #    learning_rate of the executor and the engine diverge.
+        learning_rate = optimizer._learning_rate
         optimizer = copy.deepcopy(optimizer)
         self._dist_context._serial_optimizer = optimizer
+        self._dist_context._serial_optimizer._learning_rate = learning_rate
         with program_guard(main_program, startup_program):
             with unique_name.guard("opt_"):
                 optimizer_ops = optimizer.apply_gradients(params_grads)
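
The `learning_rate` save/restore around `copy.deepcopy` is the core of the fix: deep-copying the optimizer would also clone its LRScheduler, so the executor and the engine would each step their own copy and their learning rates would drift apart. A small stand-alone sketch of the effect, assuming only paddle.optimizer.lr; `ToyOptimizer` is a made-up holder, not the Paddle optimizer:

import copy
import paddle

# One scheduler that the "engine" and the "executor" must share.
sched = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=1, gamma=0.5)

class ToyOptimizer:
    def __init__(self, lr):
        self._learning_rate = lr   # mimics only the attribute the fix touches

engine_opt = ToyOptimizer(sched)
executor_opt = copy.deepcopy(engine_opt)   # deepcopy also clones the scheduler
executor_opt._learning_rate = sched        # the fix: re-attach the original object

sched.step()   # advance the schedule once: 0.1 -> 0.05
# Without the re-attach line, executor_opt._learning_rate would still report 0.1.
assert executor_opt._learning_rate.get_lr() == sched.get_lr()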