Unverified commit 36bc5511, authored by zhaoyingli, committed by GitHub

[AutoParallel] keep lr_scheduler same between executor and engine (#55516)

Parent 77032f0e
@@ -171,14 +171,14 @@ class LRSchedulerAuto(LRScheduler):
         if self.by_step and self.train_step % self.acc_step == 0:
             if (
-                self.model._optimizer
-                and hasattr(self.model._optimizer, '_learning_rate')
+                self.model.optimizer
+                and hasattr(self.model.optimizer, '_learning_rate')
                 and isinstance(
-                    self.model._optimizer._learning_rate,
+                    self.model.optimizer._learning_rate,
                     paddle.optimizer.lr.LRScheduler,
                 )
             ):
-                self.model._optimizer._learning_rate.step()
+                self.model.optimizer._learning_rate.step()


 class History(Callback):
...
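The hunk above switches the callback from the Engine's private `_optimizer` attribute to `optimizer`; the guarded call itself is unchanged: step the scheduler only on every `acc_step`-th training step, and only when the optimizer's learning rate is actually an `LRScheduler`. Below is a minimal, self-contained sketch of that logic; the helper `maybe_step_lr` and the toy model are illustrative and not part of the PR, only `_learning_rate`, `LRScheduler`, and `step()` come from the diff itself.

import paddle


def maybe_step_lr(optimizer, train_step, acc_step):
    # Mirrors the callback's check: advance the scheduler only on every
    # acc_step-th training step, and only if the learning rate is a scheduler.
    if train_step % acc_step != 0:
        return
    lr = getattr(optimizer, "_learning_rate", None)
    if isinstance(lr, paddle.optimizer.lr.LRScheduler):
        lr.step()


scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=1, gamma=0.5)
model = paddle.nn.Linear(2, 2)
opt = paddle.optimizer.SGD(learning_rate=scheduler, parameters=model.parameters())

for step in range(1, 5):
    maybe_step_lr(opt, step, acc_step=2)  # the scheduler advances at steps 2 and 4

print(scheduler.get_lr())  # 0.025 after two scheduler steps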
@@ -970,7 +970,9 @@ class Engine:
             save_dir=save_dir,
             verbose=verbose,
             metrics=self._metrics_name(),
-            acc_step=self._acc_steps,
+            acc_step=1
+            if self._strategy.pipeline.enable
+            else self._acc_steps,  # lr update once every local batch
         )

         cbks.on_begin('train')
...
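Per the inline comment, the intent of the new conditional is that under pipeline parallelism the micro-batches of one local batch are already accumulated inside a single executor run, so the callback should advance the scheduler once per local batch instead of once every `self._acc_steps` calls. A tiny sketch of that selection (the function and argument names are placeholders, not Engine attributes):

def callback_acc_step(pipeline_enabled, acc_steps):
    # With pipeline parallelism, micro-batch accumulation already happens
    # inside one executor run, so the scheduler should step once per local batch.
    return 1 if pipeline_enabled else acc_steps


assert callback_acc_step(True, 4) == 1   # pipeline enabled: lr updates every local batch
assert callback_acc_step(False, 4) == 4  # pipeline disabled: lr updates every 4 steps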
@@ -223,10 +223,15 @@ class Parallelizer:
     def _generate_optimizer(
         self, main_program, startup_program, optimizer, params_grads
     ):
-        # NOTE: `apply_gradients` will add an Accumulator for a parameter only once,
-        # but optimizer will be called repeatedly in re-launch, so optimizer need to be copied.
+        # NOTE:
+        # 1. `apply_gradients` will add an Accumulator for a parameter only once,
+        #    but the optimizer will be called repeatedly on re-launch, so it needs to be copied.
+        # 2. The lr_scheduler cannot be deepcopied: a deepcopy would make the learning rate
+        #    diverge between the executor and the engine.
+        learning_rate = optimizer._learning_rate
         optimizer = copy.deepcopy(optimizer)
         self._dist_context._serial_optimizer = optimizer
+        self._dist_context._serial_optimizer._learning_rate = learning_rate
         with program_guard(main_program, startup_program):
             with unique_name.guard("opt_"):
                 optimizer_ops = optimizer.apply_gradients(params_grads)
...
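The shared-state problem described in the NOTE can be reproduced with a standalone scheduler. The sketch below only illustrates why the deepcopy breaks learning-rate consistency and why re-attaching the original object fixes it; it is not the Parallelizer code and assumes nothing beyond `copy.deepcopy` and the public `step()`/`get_lr()` API of `paddle.optimizer.lr.StepDecay`.

import copy

import paddle

scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.1, step_size=1, gamma=0.5)

# What the old code effectively did: copy.deepcopy(optimizer) also cloned the
# scheduler, leaving two independent schedulers afterwards.
cloned = copy.deepcopy(scheduler)
scheduler.step()             # only the original advances (e.g. via LRSchedulerAuto)
print(scheduler.get_lr())    # 0.05
print(cloned.get_lr())       # 0.1  -- the two references have drifted apart

# What the fix does: keep a reference to the original scheduler and re-attach
# it to the copied optimizer, so every reference observes the same value.
cloned = scheduler
print(cloned.get_lr())       # 0.05 -- back in sync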