From 051ba1ce1dcebb6fcd43e46fff648b323b087fca Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 9 Feb 2018 16:44:33 +0800 Subject: [PATCH] Use force cpu in fill constant op (#8254) --- python/paddle/v2/fluid/initializer.py | 37 +++++- .../paddle/v2/fluid/layers/math_op_patch.py | 10 +- python/paddle/v2/fluid/layers/tensor.py | 27 ++++- python/paddle/v2/fluid/learning_rate_decay.py | 114 ++++++++++-------- .../tests/book/test_label_semantic_roles.py | 12 +- 5 files changed, 138 insertions(+), 62 deletions(-) diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index b9c0d12ad6c..8c70fd90eff 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -14,14 +14,37 @@ import framework import numpy as np +import contextlib __all__ = [ - 'Constant', - 'Uniform', - 'Normal', - 'Xavier', + 'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu', + 'init_on_cpu' ] +_force_init_on_cpu_ = False + + +def force_init_on_cpu(): + return _force_init_on_cpu_ + + +@contextlib.contextmanager +def init_on_cpu(): + """ + Switch program with `with` statement + + Examples: + >>> with init_on_cpu(): + >>> step = layers.create_global_var() + + """ + global _force_init_on_cpu_ + + pre_state = force_init_on_cpu() + _force_init_on_cpu_ = True + yield + _force_init_on_cpu_ = pre_state + class Initializer(object): """Base class for variable initializers @@ -80,7 +103,7 @@ class ConstantInitializer(Initializer): """Implements the constant initializer """ - def __init__(self, value=0.0): + def __init__(self, value=0.0, force_cpu=False): """Constructor for ConstantInitializer Args: @@ -89,6 +112,7 @@ class ConstantInitializer(Initializer): assert value is not None super(ConstantInitializer, self).__init__() self._value = value + self._force_cpu = force_cpu def __call__(self, var, block): """Add constant initialization ops for a variable @@ -110,7 +134,8 @@ class ConstantInitializer(Initializer): attrs={ "shape": var.shape, "dtype": int(var.dtype), - "value": self._value + "value": float(self._value), + 'force_cpu': self._force_cpu or force_init_on_cpu() }) var.op = op return op diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py index 79a130a3eb1..9b5f22759cf 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/v2/fluid/layers/math_op_patch.py @@ -14,6 +14,7 @@ from ..framework import Variable, unique_name from layer_function_generator import OpProtoHolder +from ..initializer import force_init_on_cpu __all__ = ['monkey_patch_variable'] @@ -36,9 +37,12 @@ def monkey_patch_variable(): block.append_op( type="fill_constant", outputs={'Out': [var]}, - attrs={'dtype': var.dtype, - 'shape': shape, - 'value': value}) + attrs={ + 'dtype': var.dtype, + 'shape': shape, + 'value': value, + 'force_cpu': force_init_on_cpu() + }) return var def create_scalar(block, value, dtype): diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index 704e040b9f4..2d4e0ab0cc6 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable -from ..initializer import Constant +from ..initializer import Constant, force_init_on_cpu from ..core import DataType import numpy @@ -69,12 +69,30 @@ def create_parameter(shape, default_initializer) -def create_global_var(shape, value, dtype, persistable=False, name=None): +def create_global_var(shape, + value, + dtype, + persistable=False, + force_cpu=False, + name=None): + """ + Create a global variable. such as global_step + Args: + shape(list[int]): shape of the variable + value(float): the value of the variable + dtype(string): element type of the parameter + persistable(bool): if this variable is persistable + force_cpu(bool): force this variable to be on CPU + + Returns: + Variable: the created Variable + """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( dtype=dtype, shape=shape, persistable=persistable, name=name) helper.set_variable_initializer( - var, initializer=Constant(value=float(value))) + var, initializer=Constant( + value=float(value), force_cpu=force_cpu)) return var @@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): dtype(np.dtype|core.DataType|str): Data type of the output tensor. value(float): The constant value used to initialize the output tensor. out(Variable): The output tensor. + force_cpu(True|False): data should be on CPU if set true. Returns: Variable: The tensor variable storing the output. @@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): 'shape': shape, 'dtype': out.dtype, 'value': float(value), - 'force_cpu': force_cpu + 'force_cpu': force_cpu or force_init_on_cpu() }) out.stop_gradient = True return out diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py index 13dc98075f7..2a2a29fd9cb 100644 --- a/python/paddle/v2/fluid/learning_rate_decay.py +++ b/python/paddle/v2/fluid/learning_rate_decay.py @@ -14,6 +14,7 @@ import layers from framework import Variable +from initializer import init_on_cpu __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', @@ -54,11 +55,14 @@ def exponential_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for exponential_decay.") - # update learning_rate - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) - return learning_rate * (decay_rate**div_res) + with init_on_cpu(): + # update learning_rate + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + decayed_lr = learning_rate * (decay_rate**div_res) + + return decayed_lr def natural_exp_decay(learning_rate, @@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for natural_exp_decay.") - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) - return learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + with init_on_cpu(): + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res)) + + return decayed_lr def inverse_time_decay(learning_rate, @@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for inverse_time_decay.") - div_res = global_step / decay_steps - if staircase: - div_res = layers.floor(x=div_res) + with init_on_cpu(): + div_res = global_step / decay_steps + if staircase: + div_res = layers.floor(x=div_res) + + decayed_lr = learning_rate / (1 + decay_rate * div_res) - return learning_rate / (1 + decay_rate * div_res) + return decayed_lr def polynomial_decay(learning_rate, @@ -160,22 +170,27 @@ def polynomial_decay(learning_rate, if not isinstance(global_step, Variable): raise ValueError("global_step is required for inverse_time_decay.") - if cycle: - div_res = layers.ceil(x=(global_step / decay_steps)) - zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0) - one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0) - - with layers.Switch() as switch: - with switch.case(layers.equal(x=global_step, y=zero_var)): - layers.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res - else: - decay_steps_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = layers.elementwise_min(x=global_step, y=decay_steps_var) - - return (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate + with init_on_cpu(): + if cycle: + div_res = layers.ceil(x=(global_step / decay_steps)) + zero_var = layers.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = layers.fill_constant( + shape=[1], dtype='float32', value=1.0) + + with layers.Switch() as switch: + with switch.case(layers.equal(x=global_step, y=zero_var)): + layers.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = layers.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = layers.elementwise_min( + x=global_step, y=decay_steps_var) + + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr def piecewise_decay(global_step, boundaries, values): @@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values): if not isinstance(global_step, Variable): raise ValueError("global_step is required for piecewise_decay.") - lr = layers.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") - - with layers.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = layers.fill_constant( - shape=[1], dtype='float32', value=float(boundaries[i])) - value_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(layers.less_than(global_step, boundary_val)): - layers.assign(value_var, lr) - last_value_var = layers.fill_constant( - shape=[1], dtype='float32', value=float(values[len(values) - 1])) - with switch.default(): - layers.assign(last_value_var, lr) + with init_on_cpu(): + lr = layers.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + + with layers.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = layers.fill_constant( + shape=[1], dtype='float32', value=float(boundaries[i])) + value_var = layers.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(layers.less_than(global_step, boundary_val)): + layers.assign(value_var, lr) + last_value_var = layers.fill_constant( + shape=[1], + dtype='float32', + value=float(values[len(values) - 1])) + with switch.default(): + layers.assign(last_value_var, lr) return lr diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 1491f7a8d54..f33e81186bd 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -18,6 +18,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 import paddle.v2.fluid as fluid +from paddle.v2.fluid.initializer import init_on_cpu import contextlib import time import unittest @@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None): # TODO(qiao) # check other optimizers and check why out will be NAN - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + global_step = fluid.layers.create_global_var( + shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True) + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.learning_rate_decay.exponential_decay( + learning_rate=0.0001, + global_step=global_step, + decay_steps=100000, + decay_rate=0.5, + staircase=True), + global_step=global_step) sgd_optimizer.minimize(avg_cost) # TODO(qiao) -- GitLab