From ec9c0874bc711ad7bf3eca52581c58e31f2d4a4a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 27 Mar 2019 15:33:58 +0800 Subject: [PATCH] Implement Expotential NatureExp Inversetime and Polynomal Decay --- .../imperative/learning_rate_scheduler.py | 118 +++++++++++++++++- .../fluid/layers/learning_rate_scheduler.py | 95 ++++++++------ .../unittests/test_imperative_optimizer.py | 88 ++++++++++--- 3 files changed, 248 insertions(+), 53 deletions(-) diff --git a/python/paddle/fluid/imperative/learning_rate_scheduler.py b/python/paddle/fluid/imperative/learning_rate_scheduler.py index 38d893be5..60d59b0f7 100644 --- a/python/paddle/fluid/imperative/learning_rate_scheduler.py +++ b/python/paddle/fluid/imperative/learning_rate_scheduler.py @@ -16,7 +16,9 @@ from __future__ import print_function from .. import unique_name -__all__ = ['PiecewiseDecay'] +__all__ = [ + 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay', 'InverseTimeDecay' +] class LearningRateDecay(object): @@ -65,3 +67,117 @@ class PiecewiseDecay(LearningRateDecay): if self.step_num < self.boundaries[i]: return self.vars[i] return self.vars[len(self.values) - 1] + + +class NaturalExpDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + decay_rate, + staircase=False, + begin=0, + step=1, + dtype='float32'): + super(NaturalExpDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + + def step(self): + from .. import layers + div_res = self.create_lr_var(self.step_num / self.decay_steps) + if self.staircase: + div_res = layers.floor(div_res) + decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate * + div_res) + + return decayed_lr + + +class ExponentialDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + decay_rate, + staircase=False, + begin=0, + step=1, + dtype='float32'): + super(ExponentialDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + + def step(self): + from .. import layers + div_res = self.create_lr_var(self.step_num / self.decay_steps) + if self.staircase: + div_res = layers.floor(div_res) + + decayed_lr = self.learning_rate * (self.decay_rate**div_res) + + return decayed_lr + + +class InverseTimeDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + decay_rate, + staircase=False, + begin=0, + step=1, + dtype='float32'): + super(InverseTimeDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + + def step(self): + from .. import layers + div_res = self.create_lr_var(self.step_num / self.decay_steps) + if self.staircase: + div_res = layers.floor(div_res) + + decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res) + + return decayed_lr + + +class PolynomialDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + begin=0, + step=1, + dtype='float32'): + super(PolynomialDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.end_learning_rate = end_learning_rate + self.power = power + self.cycle = cycle + + def step(self): + from .. import layers + if self.cycle: + div_res = layers.ceil( + self.create_lr_var(self.step_num / self.decay_steps)) + zero_var = 0.0 + one_var = 1.0 + + if float(self.step_num) == zero_var: + div_res = one_var + decay_steps = self.decay_steps * div_res + else: + global_step = global_step if global_step < self.decay_steps else self.decay_steps + + decayed_lr = (self.learning_rate - self.end_learning_rate) * \ + ((1 - global_step / self.decay_steps) ** self.power) + self.end_learning_rate + return self.create_lr_var(decayed_lr) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 50dedac36..535234104 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -115,14 +115,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.ExponentialDecay(learning_rate, decay_steps, + decay_rate, staircase) + return decay + else: + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) - return decayed_lr + return decayed_lr def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -144,14 +149,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): The decayed learning rate """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.NaturalExpDecay(learning_rate, decay_steps, + decay_rate, staircase) + return decay + else: + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) - return decayed_lr + return decayed_lr def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -190,15 +200,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): sgd_optimizer.minimize(avg_cost) """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.InverseTimeDecay(learning_rate, decay_steps, + decay_rate, staircase) + return decay + else: + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) - decayed_lr = learning_rate / (1 + decay_rate * div_res) + decayed_lr = learning_rate / (1 + decay_rate * div_res) - return decayed_lr + return decayed_lr def polynomial_decay(learning_rate, @@ -230,27 +245,33 @@ def polynomial_decay(learning_rate, Variable: The decayed learning rate """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() - - if cycle: - div_res = ops.ceil(global_step / decay_steps) - zero_var = tensor.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = tensor.fill_constant( - shape=[1], dtype='float32', value=1.0) - - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - tensor.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res + if imperative_base.enabled(): + decay = imperate_lr.PolynomialDecay(learning_rate, decay_steps, + end_learning_rate, power, cycle) + return decay else: - decay_steps_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) + global_step = _decay_step_counter() - decayed_lr = (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate - return decayed_lr + if cycle: + div_res = ops.ceil(global_step / decay_steps) + zero_var = tensor.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = tensor.fill_constant( + shape=[1], dtype='float32', value=1.0) + + with control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + tensor.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = nn.elementwise_min( + x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr def piecewise_decay(boundaries, values): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 54d28c008..783dd6c89 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -22,7 +22,7 @@ import six import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.optimizer import SGDOptimizer, Adam from paddle.fluid.imperative.nn import FC from paddle.fluid.imperative.base import to_variable from test_imperative_base import new_program_scope @@ -46,14 +46,9 @@ class TestImperativeOptimizerBase(unittest.TestCase): self.batch_num = 10 def get_optimizer(self): - bd = [3, 6, 9] - self.optimizer = SGDOptimizer( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, - values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) - return self.optimizer + raise NotImplementedError() - def test_optimizer_float32(self): + def _check_mlp(self): seed = 90 with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed @@ -83,16 +78,14 @@ class TestImperativeOptimizerBase(unittest.TestCase): dy_out = avg_loss._numpy() if batch_id == 0: - for param in fluid.default_main_program().global_block( - ).all_parameters(): + for param in mlp.parameters(): dy_param_init_value[param.name] = param._numpy() avg_loss._backward() optimizer.minimize(avg_loss) mlp.clear_gradients() dy_param_value = {} - for param in fluid.default_main_program().global_block( - ).all_parameters(): + for param in mlp.parameters(): dy_param_value[param.name] = param._numpy() with new_program_scope(): @@ -102,7 +95,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mnist = MLP('mlp') + mlp = MLP('mlp') optimizer = self.get_optimizer() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) @@ -110,14 +103,14 @@ class TestImperativeOptimizerBase(unittest.TestCase): img = fluid.layers.data( name='pixel', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - cost = mnist(img) + cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) optimizer.minimize(avg_loss) # initialize params and fetch them static_param_init_value = {} static_param_name_list = [] - for param in mnist.parameters(): + for param in mlp.parameters(): static_param_name_list.append(param.name) out = exe.run(fluid.default_startup_program(), @@ -156,5 +149,70 @@ class TestImperativeOptimizerBase(unittest.TestCase): self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5)) +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + if __name__ == '__main__': unittest.main() -- GitLab