Unverified commit 051ba1ce authored by Qiao Longfei, committed by GitHub

Use force cpu in fill constant op (#8254)

Parent 222155cc
@@ -14,14 +14,37 @@
import framework
import numpy as np
import contextlib
__all__ = [
'Constant',
'Uniform',
'Normal',
'Xavier',
'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
'init_on_cpu'
]
_force_init_on_cpu_ = False
def force_init_on_cpu():
return _force_init_on_cpu_
@contextlib.contextmanager
def init_on_cpu():
"""
Force variable initialization onto CPU inside a `with` block
Examples:
>>> with init_on_cpu():
>>> step = layers.create_global_var()
"""
global _force_init_on_cpu_
pre_state = force_init_on_cpu()
_force_init_on_cpu_ = True
yield
_force_init_on_cpu_ = pre_state
class Initializer(object):
"""Base class for variable initializers
@@ -80,7 +103,7 @@ class ConstantInitializer(Initializer):
"""Implements the constant initializer
"""
def __init__(self, value=0.0):
def __init__(self, value=0.0, force_cpu=False):
"""Constructor for ConstantInitializer
Args:
@@ -89,6 +112,7 @@ class ConstantInitializer(Initializer):
assert value is not None
super(ConstantInitializer, self).__init__()
self._value = value
self._force_cpu = force_cpu
def __call__(self, var, block):
"""Add constant initialization ops for a variable
@@ -110,7 +134,8 @@ class ConstantInitializer(Initializer):
attrs={
"shape": var.shape,
"dtype": int(var.dtype),
"value": self._value
"value": float(self._value),
'force_cpu': self._force_cpu or force_init_on_cpu()
})
var.op = op
return op
......
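A minimal usage sketch of the `init_on_cpu()` / `force_init_on_cpu()` switch added above (illustrative only; it assumes nothing beyond the fluid APIs that appear elsewhere in this diff):

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.initializer import force_init_on_cpu, init_on_cpu

    assert not force_init_on_cpu()  # default state

    with init_on_cpu():
        # Initializers created inside this block see force_init_on_cpu() == True,
        # so their fill_constant ops are emitted with force_cpu=True.
        step = fluid.layers.create_global_var(
            shape=[1], value=0.0, dtype='float32', persistable=True)

    assert not force_init_on_cpu()  # the previous state is restored on exit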
@@ -14,6 +14,7 @@
from ..framework import Variable, unique_name
from layer_function_generator import OpProtoHolder
from ..initializer import force_init_on_cpu
__all__ = ['monkey_patch_variable']
@@ -36,9 +37,12 @@ def monkey_patch_variable():
block.append_op(
type="fill_constant",
outputs={'Out': [var]},
attrs={'dtype': var.dtype,
'shape': shape,
'value': value})
attrs={
'dtype': var.dtype,
'shape': shape,
'value': value,
'force_cpu': force_init_on_cpu()
})
return var
def create_scalar(block, value, dtype):
......
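With the change above, the temporary constants created by the monkey-patched arithmetic operators also honor the CPU switch. A rough sketch (the value 100 is an arbitrary illustrative step count):

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.initializer import init_on_cpu

    global_step = fluid.layers.create_global_var(
        shape=[1], value=0, dtype='float32', persistable=True, force_cpu=True)

    with init_on_cpu():
        # The constant 100 is materialized by a fill_constant op emitted by the
        # patched division operator; with this patch that op carries
        # force_cpu=True whenever force_init_on_cpu() is active.
        div_res = global_step / 100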
@@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr
from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable
from ..initializer import Constant
from ..initializer import Constant, force_init_on_cpu
from ..core import DataType
import numpy
@@ -69,12 +69,30 @@ def create_parameter(shape,
default_initializer)
def create_global_var(shape, value, dtype, persistable=False, name=None):
def create_global_var(shape,
value,
dtype,
persistable=False,
force_cpu=False,
name=None):
"""
Create a global variable, such as global_step.
Args:
shape(list[int]): shape of the variable
value(float): the value of the variable
dtype(string): data type of the variable
persistable(bool): if this variable is persistable
force_cpu(bool): force this variable to be on CPU
Returns:
Variable: the created Variable
"""
helper = LayerHelper("global_var", **locals())
var = helper.create_global_variable(
dtype=dtype, shape=shape, persistable=persistable, name=name)
helper.set_variable_initializer(
var, initializer=Constant(value=float(value)))
var, initializer=Constant(
value=float(value), force_cpu=force_cpu))
return var
@@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
dtype(np.dtype|core.DataType|str): Data type of the output tensor.
value(float): The constant value used to initialize the output tensor.
out(Variable): The output tensor.
force_cpu(bool): whether the output tensor should be placed on CPU.
Returns:
Variable: The tensor variable storing the output.
@@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
'shape': shape,
'dtype': out.dtype,
'value': float(value),
'force_cpu': force_cpu
'force_cpu': force_cpu or force_init_on_cpu()
})
out.stop_gradient = True
return out
......
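A short sketch of the new `force_cpu` arguments on the tensor helpers above (variable names are illustrative):

    import paddle.v2.fluid as fluid

    # A step counter that must stay on CPU, e.g. the global step fed to lr decay.
    global_step = fluid.layers.create_global_var(
        shape=[1], value=0, dtype='float32', persistable=True, force_cpu=True)

    # fill_constant can be pinned to CPU explicitly; it is also pinned whenever
    # an enclosing init_on_cpu() block makes force_init_on_cpu() return True.
    zero = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=0.0, force_cpu=True)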
@@ -14,6 +14,7 @@
import layers
from framework import Variable
from initializer import init_on_cpu
__all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -54,11 +55,14 @@ def exponential_decay(learning_rate,
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for exponential_decay.")
# update learning_rate
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
return learning_rate * (decay_rate**div_res)
with init_on_cpu():
# update learning_rate
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr
def natural_exp_decay(learning_rate,
@@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate,
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for natural_exp_decay.")
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
with init_on_cpu():
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
return decayed_lr
def inverse_time_decay(learning_rate,
@@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate,
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.")
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
with init_on_cpu():
div_res = global_step / decay_steps
if staircase:
div_res = layers.floor(x=div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res)
return learning_rate / (1 + decay_rate * div_res)
return decayed_lr
def polynomial_decay(learning_rate,
@@ -160,22 +170,27 @@ def polynomial_decay(learning_rate,
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for inverse_time_decay.")
if cycle:
div_res = layers.ceil(x=(global_step / decay_steps))
zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
with layers.Switch() as switch:
with switch.case(layers.equal(x=global_step, y=zero_var)):
layers.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = layers.elementwise_min(x=global_step, y=decay_steps_var)
return (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate
with init_on_cpu():
if cycle:
div_res = layers.ceil(x=(global_step / decay_steps))
zero_var = layers.fill_constant(
shape=[1], dtype='float32', value=0.0)
one_var = layers.fill_constant(
shape=[1], dtype='float32', value=1.0)
with layers.Switch() as switch:
with switch.case(layers.equal(x=global_step, y=zero_var)):
layers.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = layers.elementwise_min(
x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr
def piecewise_decay(global_step, boundaries, values):
@@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values):
if not isinstance(global_step, Variable):
raise ValueError("global_step is required for piecewise_decay.")
lr = layers.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="learning_rate")
with layers.Switch() as switch:
for i in range(len(boundaries)):
boundary_val = layers.fill_constant(
shape=[1], dtype='float32', value=float(boundaries[i]))
value_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(layers.less_than(global_step, boundary_val)):
layers.assign(value_var, lr)
last_value_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
layers.assign(last_value_var, lr)
with init_on_cpu():
lr = layers.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="learning_rate")
with layers.Switch() as switch:
for i in range(len(boundaries)):
boundary_val = layers.fill_constant(
shape=[1], dtype='float32', value=float(boundaries[i]))
value_var = layers.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(layers.less_than(global_step, boundary_val)):
layers.assign(value_var, lr)
last_value_var = layers.fill_constant(
shape=[1],
dtype='float32',
value=float(values[len(values) - 1]))
with switch.default():
layers.assign(last_value_var, lr)
return lr
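Putting the pieces together: the decay helpers above now build their sub-graphs inside `init_on_cpu()`, so a caller only has to keep the step counter itself on CPU. A sketch of the intended usage (mirrored by the test change below; the hyper-parameter values are illustrative):

    import paddle.v2.fluid as fluid

    global_step = fluid.layers.create_global_var(
        shape=[1], value=0, dtype='float32', persistable=True, force_cpu=True)

    # The constants created while building the decay expression are placed on
    # CPU because exponential_decay wraps its body in init_on_cpu().
    lr = fluid.learning_rate_decay.exponential_decay(
        learning_rate=0.0001,
        global_step=global_step,
        decay_steps=100000,
        decay_rate=0.5,
        staircase=True)

    sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr, global_step=global_step)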
@@ -18,6 +18,7 @@ import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid as fluid
from paddle.v2.fluid.initializer import init_on_cpu
import contextlib
import time
import unittest
@@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None):
# TODO(qiao)
# check other optimizers and check why out will be NAN
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
global_step = fluid.layers.create_global_var(
shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=fluid.learning_rate_decay.exponential_decay(
learning_rate=0.0001,
global_step=global_step,
decay_steps=100000,
decay_rate=0.5,
staircase=True),
global_step=global_step)
sgd_optimizer.minimize(avg_cost)
# TODO(qiao)
......