diff --git a/python/paddle/fluid/imperative/learning_rate_scheduler.py b/python/paddle/fluid/imperative/learning_rate_scheduler.py
index 60d59b0f761b763ecf376a437c2bc9318fc57ff7..0ace448d7f884ff866085c43171acb4353ff4b91 100644
--- a/python/paddle/fluid/imperative/learning_rate_scheduler.py
+++ b/python/paddle/fluid/imperative/learning_rate_scheduler.py
@@ -14,10 +14,13 @@
 
 from __future__ import print_function
 
+import math
+
 from .. import unique_name
 
 __all__ = [
-    'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay', 'InverseTimeDecay'
+    'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
+    'InverseTimeDecay', 'CosineDecay'
 ]
 
 
@@ -34,7 +37,7 @@ class LearningRateDecay(object):
     def __call__(self):
         lr = self.step()
         if isinstance(lr, float):
-            lr = self._create_lr_var(lr)
+            lr = self.create_lr_var(lr)
         self.step_num += self.step_size
         return lr
 
@@ -166,18 +169,58 @@ class PolynomialDecay(LearningRateDecay):
 
     def step(self):
         from .. import layers
+        tmp_step_num = self.step_num
+        tmp_decay_steps = self.decay_steps
         if self.cycle:
             div_res = layers.ceil(
-                self.create_lr_var(self.step_num / self.decay_steps))
+                self.create_lr_var(tmp_step_num / self.decay_steps))
             zero_var = 0.0
             one_var = 1.0
 
-            if float(self.step_num) == zero_var:
+            if float(tmp_step_num) == zero_var:
                 div_res = one_var
-            decay_steps = self.decay_steps * div_res
+            tmp_decay_steps = self.decay_steps * div_res
         else:
-            global_step = global_step if global_step < self.decay_steps else self.decay_steps
+            tmp_step_num = self.create_lr_var(tmp_step_num
+                                              if tmp_step_num < self.decay_steps
+                                              else self.decay_steps)
+
+        decayed_lr = (self.learning_rate - self.end_learning_rate) * \
+            ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
+        return decayed_lr
 
-        decayed_lr = (self.learning_rate - self.end_learning_rate) * \
-            ((1 - global_step / self.decay_steps) ** self.power) + self.end_learning_rate
-        return self.create_lr_var(decayed_lr)
+
+class CosineDecay(LearningRateDecay):
+    def __init__(self,
+                 learning_rate,
+                 step_each_epoch,
+                 epochs,
+                 begin=0,
+                 step=1,
+                 dtype='float32'):
+        super(CosineDecay, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
+        self.step_each_epoch = step_each_epoch
+        self.epochs = epochs
+
+    def step(self):
+        from .. import layers
+        cur_epoch = layers.floor(
+            self.create_lr_var(self.step_num / self.step_each_epoch))
+        decayed_lr = self.learning_rate * 0.5 * (
+            layers.cos(cur_epoch * math.pi / self.epochs) + 1)
+        return decayed_lr
+
+
+class NoamDecay(LearningRateDecay):
+    def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
+        super(NoamDecay, self).__init__(begin, step, dtype)
+        self.d_model = d_model
+        self.warmup_steps = warmup_steps
+
+    def step(self):
+        from .. import layers
+        a = self.create_lr_var(self.step_num**-0.5)
+        b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
+        lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
+        return lr_value
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 535234104699a8b5fc0870f55847812c65b2f59a..069ade544587ed43906debe03d6d13aa6c2e0447 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -69,13 +69,17 @@ def noam_decay(d_model, warmup_steps):
         The decayed learning rate.
""" with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter(1) + if imperative_base.enabled(): + decay = imperate_lr.NoamDecay(d_model, warmup_steps) + return decay + else: + global_step = _decay_step_counter(1) - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) - return lr_value + return lr_value def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -364,12 +368,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): learning_rate = base_lr, step_each_epoch=10000, epochs=120) """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch, + epochs) + return decay + else: + global_step = _decay_step_counter() - cur_epoch = ops.floor(global_step / step_each_epoch) - decayed_lr = learning_rate * 0.5 * ( - ops.cos(cur_epoch * math.pi / epochs) + 1) - return decayed_lr + cur_epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * 0.5 * ( + ops.cos(cur_epoch * math.pi / epochs) + 1) + return decayed_lr def append_LARS(params_grads, learning_rate, weight_decay): @@ -391,6 +400,9 @@ def append_LARS(params_grads, learning_rate, weight_decay): / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ + assert not imperative_base.enabled( + ), "append_LARS is NOT supported in dygraph mode now" + def _balanced_weight(param_norm, grad_norm): if weight_decay == 1.0: return grad_norm + param_norm diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7a5147ef2e0907117404a8e227a4dd30cd54b1aa..f0544a80a9df9f2d5692b112f6feafa93dc28743 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -195,6 +195,8 @@ class Optimizer(object): name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): + if framework._in_imperative_mode(): + return self._accumulators[name][param.name] raise Exception("Accumulator {} already exists for parameter {}". 
                             format(name, param.name))
         if shape == None:
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 783dd6c8957bdd286a1bc8b726a26f5aa891a6dc..f509ff4a2336a9fda8263327ff623ceefd9deea4 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -43,7 +43,7 @@ class MLP(fluid.imperative.Layer):
 
 class TestImperativeOptimizerBase(unittest.TestCase):
     def setUp(self):
-        self.batch_num = 10
+        self.batch_num = 20
 
     def get_optimizer(self):
         raise NotImplementedError()
@@ -214,5 +214,25 @@ class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
         self._check_mlp()
 
 
+class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
+            learning_rate=0.1, step_each_epoch=10000, epochs=120))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
+class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
+    def get_optimizer(self):
+        optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
+            d_model=512, warmup_steps=8000))
+        return optimizer
+
+    def test_sgd(self):
+        self._check_mlp()
+
+
 if __name__ == '__main__':
     unittest.main()
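
Reviewer note: in dygraph mode the two new schedulers are intended to reproduce the same per-step values as the existing static-graph formulas in layers/learning_rate_scheduler.py. A standalone plain-Python sketch of those formulas for sanity checking; the helper names noam_lr and cosine_lr are illustrative, not part of the patch:

import math

def noam_lr(d_model, warmup_steps, step_num):
    # Noam schedule: d_model^-0.5 * min(step^-0.5, warmup_steps^-1.5 * step)
    a = step_num ** -0.5
    b = (warmup_steps ** -1.5) * step_num
    return (d_model ** -0.5) * min(a, b)

def cosine_lr(learning_rate, step_each_epoch, epochs, step_num):
    # Cosine schedule: lr * 0.5 * (cos(cur_epoch * pi / epochs) + 1)
    cur_epoch = math.floor(step_num / step_each_epoch)
    return learning_rate * 0.5 * (math.cos(cur_epoch * math.pi / epochs) + 1)

# Noam warms up toward warmup_steps and then decays; cosine anneals toward zero.
print([round(noam_lr(512, 8000, s), 6) for s in (1, 4000, 8000, 80000)])
print([round(cosine_lr(0.1, 10000, 120, s), 6) for s in (0, 300000, 600000, 1190000)])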
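
A short usage sketch of the new dygraph path, mirroring how the added unit tests wire the schedulers into an optimizer. This is illustrative only: fluid.imperative.guard() and SGDOptimizer are taken from the existing test file, and the comments describe the behaviour added by this patch as I read it, not a definitive API contract.

import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer

with fluid.imperative.guard():
    # Under the guard, noam_decay/cosine_decay now return LearningRateDecay
    # objects (imperate_lr.NoamDecay / imperate_lr.CosineDecay) instead of
    # building a static-graph schedule.
    noam = fluid.layers.noam_decay(d_model=512, warmup_steps=8000)
    cosine = fluid.layers.cosine_decay(
        learning_rate=0.1, step_each_epoch=10000, epochs=120)

    optimizer = SGDOptimizer(learning_rate=noam)

    # Querying a schedule by hand: LearningRateDecay.__call__ runs step(),
    # wraps a plain-float result via create_lr_var, and then advances
    # step_num by step_size.
    current_lr = noam()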