From 175cf6e02467353e0187e4ccfd86e5ac3c670387 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Feb 2018 11:40:24 +0800 Subject: [PATCH] Add global_step in nn.py --- python/paddle/fluid/layer_helper.py | 19 ++++ python/paddle/fluid/layers/nn.py | 23 ++++ python/paddle/fluid/learning_rate_decay.py | 107 +++++++----------- python/paddle/fluid/optimizer.py | 25 +--- .../tests/book/test_label_semantic_roles.py | 6 +- .../unittests/test_learning_rate_decay.py | 25 ++-- 6 files changed, 101 insertions(+), 104 deletions(-) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 6437dbb446e..da7e74c901e 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -330,9 +330,28 @@ class LayerHelper(object): return self.main_program.current_block().create_var(*args, **kwargs) def create_global_variable(self, persistable=False, *args, **kwargs): + """ + create global variable, note that there is no initializer for this global variable. + Args: + persistable(bool): True if it is a checkpoint value. + *args: See create_var's documentation + **kwargs: See create_var's documentation + + Returns(Variable): the created variable. + """ return self.main_program.global_block().create_var( *args, persistable=persistable, **kwargs) + def create_or_get_global_variable(self, name, *args, **kwargs): + """ + Creates a global variable if not exists and returns the variable and + a boolean flag which is true when it is a new variable. + """ + if self.main_program.global_block().has_var(name): + return self.main_program.global_block().var(name), False + else: + return self.create_global_variable(name=name, *args, **kwargs), True + def set_variable_initializer(self, var, initializer): assert isinstance(var, Variable) self.startup_program.global_block().create_var( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3453dd945d5..061c9ca7f8b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -69,6 +69,7 @@ __all__ = [ 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', + 'global_step_counter', ] @@ -3250,3 +3251,25 @@ def one_hot(input, depth): attrs={'depth': depth}, outputs={'Out': one_hot_out}) return one_hot_out + + +def global_step_counter(): + """ + Return the run counter of the main program, which is started with 1. + Returns(Variable): The global run counter. + """ + helper = LayerHelper('global_step_counter') + counter_name = '@STEP_COUNTER@' + counter, is_new_var = helper.create_or_get_global_variable( + name=counter_name, dtype='int64', shape=[1], persistable=True) + if is_new_var: + helper.set_variable_initializer( + counter, initializer=Constant( + value=0, force_cpu=True)) + helper.main_program.global_block().prepend_op( + type='increment', + inputs={'X': [counter]}, + outputs={'Out': [counter]}) + counter.stop_gradient = True + + return counter diff --git a/python/paddle/fluid/learning_rate_decay.py b/python/paddle/fluid/learning_rate_decay.py index 0826d3da79a..558a739c307 100644 --- a/python/paddle/fluid/learning_rate_decay.py +++ b/python/paddle/fluid/learning_rate_decay.py @@ -30,11 +30,14 @@ strategy according to this module. 
""" -def exponential_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False): +def float_global_step(): + # the first global step is zero in learning rate decay + global_step = layers.global_step_counter() - 1 + global_step = layers.cast(global_step, 'float32') + return global_step + + +def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """Applies exponential decay to the learning rate. ```python @@ -44,7 +47,6 @@ def exponential_decay(learning_rate, Args: learning_rate: A scalar float32 value or a Variable. This will be the initial learning rate during training - global_step: A Variable that record the training step. decay_steps: A Python `int32` number. decay_rate: A Python `float` number. staircase: Boolean. If set true, decay the learning rate every decay_steps. @@ -52,8 +54,7 @@ def exponential_decay(learning_rate, Returns: The decayed learning rate """ - if not isinstance(global_step, Variable): - raise ValueError("global_step is required for exponential_decay.") + global_step = float_global_step() with init_on_cpu(): # update learning_rate @@ -65,23 +66,17 @@ def exponential_decay(learning_rate, return decayed_lr -def natural_exp_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False): +def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): """Applies natural exponential decay to the initial learning rate. - ```python - if not staircase: - decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) - else: - decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) - ``` + >>> if not staircase: + >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) + >>> else: + >>> decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) + Args: learning_rate: A scalar float32 value or a Variable. This will be the initial learning rate during training - global_step: A Variable that record the training step. decay_steps: A Python `int32` number. decay_rate: A Python `float` number. staircase: Boolean. If set true, decay the learning rate every decay_steps. @@ -89,8 +84,7 @@ def natural_exp_decay(learning_rate, Returns: The decayed learning rate """ - if not isinstance(global_step, Variable): - raise ValueError("global_step is required for natural_exp_decay.") + global_step = float_global_step() with init_on_cpu(): div_res = global_step / decay_steps @@ -101,23 +95,17 @@ def natural_exp_decay(learning_rate, return decayed_lr -def inverse_time_decay(learning_rate, - global_step, - decay_steps, - decay_rate, - staircase=False): +def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): """Applies inverse time decay to the initial learning rate. - ```python - if staircase: - decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) - else: - decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) - ``` + >>> if staircase: + >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) + >>> else: + >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) + Args: learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training - global_step: A Variable that record the training step. + will be the initial learning rate during training. decay_steps: A Python `int32` number. 
decay_rate: A Python `float` number. staircase: Boolean. If set true, decay the learning rate every decay_steps. @@ -125,8 +113,7 @@ def inverse_time_decay(learning_rate, Returns: The decayed learning rate """ - if not isinstance(global_step, Variable): - raise ValueError("global_step is required for inverse_time_decay.") + global_step = float_global_step() with init_on_cpu(): div_res = global_step / decay_steps @@ -139,26 +126,22 @@ def inverse_time_decay(learning_rate, def polynomial_decay(learning_rate, - global_step, decay_steps, end_learning_rate=0.0001, power=1.0, cycle=False): """Applies polynomial decay to the initial learning rate. - ```python - if cycle: - decay_steps = decay_steps * ceil(global_step / decay_steps) - else: - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ power + - end_learning_rate - ``` + >>> if cycle: + >>> decay_steps = decay_steps * ceil(global_step / decay_steps) + >>> else: + >>> global_step = min(global_step, decay_steps) + >>> decayed_learning_rate = (learning_rate - end_learning_rate) * + >>> (1 - global_step / decay_steps) ^ power + + >>> end_learning_rate Args: learning_rate: A scalar float32 value or a Variable. This will be the initial learning rate during training - global_step: A Variable that record the training step. decay_steps: A Python `int32` number. end_learning_rate: A Python `float` number. power: A Python `float` number @@ -167,8 +150,7 @@ def polynomial_decay(learning_rate, Returns: The decayed learning rate """ - if not isinstance(global_step, Variable): - raise ValueError("global_step is required for inverse_time_decay.") + global_step = float_global_step() with init_on_cpu(): if cycle: @@ -193,27 +175,24 @@ def polynomial_decay(learning_rate, return decayed_lr -def piecewise_decay(global_step, boundaries, values): +def piecewise_decay(boundaries, values): """Applies piecewise decay to the initial learning rate. - ```python - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - - if step < 10000: - learning_rate = 1.0 - elif step >= 10000 and step < 20000: - learning_rate = 0.5 - else: - learning_rate = 0.1 - ``` + >>> boundaries = [10000, 20000] + >>> values = [1.0, 0.5, 0.1] + >>> + >>> if step < 10000: + >>> learning_rate = 1.0 + >>> elif 10000 <= step < 20000: + >>> learning_rate = 0.5 + >>> else: + >>> learning_rate = 0.1 """ if len(values) - len(boundaries) != 1: raise ValueError("len(values) - len(boundaries) should be 1") - if not isinstance(global_step, Variable): - raise ValueError("global_step is required for piecewise_decay.") + global_step = float_global_step() with init_on_cpu(): lr = layers.create_global_var( diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 61febc4e383..db43141ea16 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -35,9 +35,8 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, learning_rate, global_step=None, regularization=None): + def __init__(self, learning_rate, regularization=None): assert learning_rate is not None - self._global_step = global_step self.regularization = regularization self._global_learning_rate = learning_rate # Dictionary of accumulators. 
Some optimizer subclasses need to @@ -144,26 +143,6 @@ class Optimizer(object): format(name, param.name)) return self._accumulators[name][param.name] - def _increment_global_step(self, block): - """Increment the global step by 1 after every iteration - - Args: - block: the block in which the loss variable is present - - Returns: - list with global_step increment op as its only element - """ - assert isinstance(block, framework.Block) - assert self._global_step is not None - # create the increment op - increment_op = block.append_op( - type="increment", - inputs={"X": self._global_step}, - outputs={"Out": self._global_step}, - attrs={"step": 1.0}) - - return increment_op - def create_optimization_pass(self, parameters_and_grads, loss, @@ -210,8 +189,6 @@ class Optimizer(object): # FIXME: Need to fix this once we figure out how to handle dependencies self._finish_update(loss.block) - if self._global_step is not None: - self._increment_global_step(loss.block) end = len(global_block.ops) return global_block.slice_ops(start, end) diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index fcc9dbf8bbf..12e797fdabc 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -168,16 +168,12 @@ def train(use_cuda, save_dirname=None): # TODO(qiao) # check other optimizers and check why out will be NAN - global_step = fluid.layers.create_global_var( - shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.learning_rate_decay.exponential_decay( learning_rate=0.0001, - global_step=global_step, decay_steps=100000, decay_rate=0.5, - staircase=True), - global_step=global_step) + staircase=True)) sgd_optimizer.minimize(avg_cost) # TODO(qiao) diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py index 595b0516892..14c7615c140 100644 --- a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py +++ b/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py @@ -28,7 +28,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): - exponent = float(global_step) / float(decay_steps) + exponent = global_step / decay_steps if staircase: exponent = math.floor(exponent) return learning_rate * decay_rate**exponent @@ -83,22 +83,25 @@ def piecewise_decay(global_step, boundaries, values): class TestLearningRateDecay(unittest.TestCase): def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs): - global_step = layers.create_global_var( - shape=[1], value=0.0, dtype='float32', persistable=True) - - decayed_lr = fluid_decay_fn(global_step=global_step, **kwargs) - layers.increment(global_step, 1.0) + decayed_lr = fluid_decay_fn(**kwargs) place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) for step in range(10): - step_val, lr_val = exe.run(fluid.default_main_program(), - feed=[], - fetch_list=[global_step, decayed_lr]) - python_decayed_lr = python_decay_fn(global_step=step, **kwargs) - self.assertAlmostEqual(python_decayed_lr, lr_val[0]) + step_val, lr_val = exe.run( + fluid.default_main_program(), + feed=[], + fetch_list=[fluid.layers.global_step_counter(), decayed_lr]) + python_decayed_lr = python_decay_fn( + global_step=float(step), **kwargs) + self.assertAlmostEqual( + python_decayed_lr, + lr_val[0], + msg='Failed 
fn is {0}, Python result is {1}, Fluid result is {2}'. + format(python_decay_fn.__name__, + str(python_decayed_lr), str(lr_val[0]))) def test_decay(self): common_kwargs_true = { -- GitLab
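
Not part of the patch itself: a minimal call-site sketch of the API after this change, adapted from the updated test_label_semantic_roles.py above. The toy regression network (x, y, y_predict, avg_cost) is an assumption added only so the snippet is self-contained; the decay and optimizer calls mirror the patch.

```python
import paddle.fluid as fluid

# Hypothetical toy regression network, only here to give the optimizer a loss.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_cost = fluid.layers.mean(x=cost)

# After this patch the decay helpers read the step from the implicit
# '@STEP_COUNTER@' variable created by layers.global_step_counter(),
# so neither the decay function nor the optimizer takes a global_step
# argument any more.
sgd_optimizer = fluid.optimizer.SGD(
    learning_rate=fluid.learning_rate_decay.exponential_decay(
        learning_rate=0.0001,
        decay_steps=100000,
        decay_rate=0.5,
        staircase=True))
sgd_optimizer.minimize(avg_cost)

# The counter can still be fetched explicitly, e.g. for logging; calling
# global_step_counter() again reuses the existing variable and does not
# prepend a second increment op.
step = fluid.layers.global_step_counter()
```

Because the step counter is now created and incremented implicitly, callers no longer need to create a persistable step variable, pass it into both the decay function and the optimizer, or increment it by hand, which is exactly the simplification visible in the test_label_semantic_roles.py and test_learning_rate_decay.py hunks above.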