From 051ba1ce1dcebb6fcd43e46fff648b323b087fca Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 9 Feb 2018 16:44:33 +0800
Subject: [PATCH] Use force cpu in fill constant op (#8254)

---
 python/paddle/v2/fluid/initializer.py         |  37 +++++-
 .../paddle/v2/fluid/layers/math_op_patch.py   |  10 +-
 python/paddle/v2/fluid/layers/tensor.py       |  27 ++++-
 python/paddle/v2/fluid/learning_rate_decay.py | 114 ++++++++++--------
 .../tests/book/test_label_semantic_roles.py   |  12 +-
 5 files changed, 138 insertions(+), 62 deletions(-)

diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py
index b9c0d12ad6c..8c70fd90eff 100644
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -14,14 +14,37 @@
 
 import framework
 import numpy as np
+import contextlib
 
 __all__ = [
-    'Constant',
-    'Uniform',
-    'Normal',
-    'Xavier',
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
+    'init_on_cpu'
 ]
 
+_force_init_on_cpu_ = False  # flipped to True by init_on_cpu()
+
+
+def force_init_on_cpu():
+    return _force_init_on_cpu_  # current value of the module-level flag
+
+
+@contextlib.contextmanager
+def init_on_cpu():
+    """
+    Make force_init_on_cpu() return True inside the `with` block.
+
+    Examples:
+        >>> with init_on_cpu():
+        >>>   step = layers.create_global_var()
+    """
+    global _force_init_on_cpu_
+    pre_state = force_init_on_cpu()  # remembered so nested blocks unwind
+    _force_init_on_cpu_ = True
+    try:
+        yield
+    finally:  # restore the previous state even if the with-body raises
+        _force_init_on_cpu_ = pre_state
+
 
 class Initializer(object):
     """Base class for variable initializers
@@ -80,7 +103,7 @@ class ConstantInitializer(Initializer):
     """Implements the constant initializer
     """
 
-    def __init__(self, value=0.0):
+    def __init__(self, value=0.0, force_cpu=False):
         """Constructor for ConstantInitializer
 
         Args:
@@ -89,6 +112,7 @@ class ConstantInitializer(Initializer):
         assert value is not None
         super(ConstantInitializer, self).__init__()
         self._value = value
+        self._force_cpu = force_cpu
 
     def __call__(self, var, block):
         """Add constant initialization ops for a variable
@@ -110,7 +134,8 @@ class ConstantInitializer(Initializer):
             attrs={
                 "shape": var.shape,
                 "dtype": int(var.dtype),
-                "value": self._value
+                "value": float(self._value),
+                'force_cpu': self._force_cpu or force_init_on_cpu()
             })
         var.op = op
         return op
diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py
index 79a130a3eb1..9b5f22759cf 100644
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -14,6 +14,7 @@
 
 from ..framework import Variable, unique_name
 from layer_function_generator import OpProtoHolder
+from ..initializer import force_init_on_cpu
 
 __all__ = ['monkey_patch_variable']
 
@@ -36,9 +37,12 @@ def monkey_patch_variable():
         block.append_op(
             type="fill_constant",
             outputs={'Out': [var]},
-            attrs={'dtype': var.dtype,
-                   'shape': shape,
-                   'value': value})
+            attrs={
+                'dtype': var.dtype,
+                'shape': shape,
+                'value': value,
+                'force_cpu': force_init_on_cpu()
+            })
         return var
 
     def create_scalar(block, value, dtype):
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 704e040b9f4..2d4e0ab0cc6 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -16,7 +16,7 @@ from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
-from ..initializer import Constant
+from ..initializer import Constant, force_init_on_cpu
 from ..core import DataType
 import numpy
 
@@ -69,12 +69,30 @@ def create_parameter(shape,
                                    default_initializer)
 
 
-def create_global_var(shape, value, dtype, persistable=False, name=None):
+def create_global_var(shape,
+                      value,
+                      dtype,
+                      persistable=False,
+                      force_cpu=False,
+                      name=None):
+    """Create a global variable, such as global_step.
+
+    Args:
+        shape(list[int]): shape of the variable
+        value(float): constant value the variable is initialized with
+        dtype(string): element type of the variable
+        persistable(bool): if this variable is persistable
+        force_cpu(bool): force this variable to be on CPU
+        name(str): name given to the created variable, optional
+    Returns:
+        Variable: the created Variable
+    """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
         dtype=dtype, shape=shape, persistable=persistable, name=name)
     helper.set_variable_initializer(
-        var, initializer=Constant(value=float(value)))
+        var, initializer=Constant(
+            value=float(value), force_cpu=force_cpu))
     return var
 
 
@@ -221,6 +239,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
         dtype(np.dtype|core.DataType|str): Data type of the output tensor.
         value(float): The constant value used to initialize the output tensor.
         out(Variable): The output tensor.
+        force_cpu(bool): if True, the output data is forced to be on CPU.
 
     Returns:
         Variable: The tensor variable storing the output.
@@ -242,7 +261,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
             'shape': shape,
             'dtype': out.dtype,
             'value': float(value),
-            'force_cpu': force_cpu
+            'force_cpu': force_cpu or force_init_on_cpu()
         })
     out.stop_gradient = True
     return out
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
index 13dc98075f7..2a2a29fd9cb 100644
--- a/python/paddle/v2/fluid/learning_rate_decay.py
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -14,6 +14,7 @@
 
 import layers
 from framework import Variable
+from initializer import init_on_cpu
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -54,11 +55,14 @@ def exponential_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for exponential_decay.")
 
-    # update learning_rate
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
-    return learning_rate * (decay_rate**div_res)
+    with init_on_cpu():
+        # update learning_rate
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+        decayed_lr = learning_rate * (decay_rate**div_res)
+
+    return decayed_lr
 
 
 def natural_exp_decay(learning_rate,
@@ -88,10 +92,13 @@ def natural_exp_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for natural_exp_decay.")
 
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
-    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+    with init_on_cpu():
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+        decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+    return decayed_lr
 
 
 def inverse_time_decay(learning_rate,
@@ -121,11 +128,14 @@ def inverse_time_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for inverse_time_decay.")
 
-    div_res = global_step / decay_steps
-    if staircase:
-        div_res = layers.floor(x=div_res)
+    with init_on_cpu():
+        div_res = global_step / decay_steps
+        if staircase:
+            div_res = layers.floor(x=div_res)
+
+        decayed_lr = learning_rate / (1 + decay_rate * div_res)
 
-    return learning_rate / (1 + decay_rate * div_res)
+    return decayed_lr
 
 
 def polynomial_decay(learning_rate,
@@ -160,22 +170,27 @@ def polynomial_decay(learning_rate,
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for inverse_time_decay.")
 
-    if cycle:
-        div_res = layers.ceil(x=(global_step / decay_steps))
-        zero_var = layers.fill_constant(shape=[1], dtype='float32', value=0.0)
-        one_var = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
-
-        with layers.Switch() as switch:
-            with switch.case(layers.equal(x=global_step, y=zero_var)):
-                layers.assign(input=one_var, output=div_res)
-        decay_steps = decay_steps * div_res
-    else:
-        decay_steps_var = layers.fill_constant(
-            shape=[1], dtype='float32', value=float(decay_steps))
-        global_step = layers.elementwise_min(x=global_step, y=decay_steps_var)
-
-    return (learning_rate - end_learning_rate) * \
-           ((1 - global_step / decay_steps) ** power) + end_learning_rate
+    with init_on_cpu():
+        if cycle:
+            div_res = layers.ceil(x=(global_step / decay_steps))
+            zero_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=0.0)
+            one_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+
+            with layers.Switch() as switch:
+                with switch.case(layers.equal(x=global_step, y=zero_var)):
+                    layers.assign(input=one_var, output=div_res)
+            decay_steps = decay_steps * div_res
+        else:
+            decay_steps_var = layers.fill_constant(
+                shape=[1], dtype='float32', value=float(decay_steps))
+            global_step = layers.elementwise_min(
+                x=global_step, y=decay_steps_var)
+
+        decayed_lr = (learning_rate - end_learning_rate) * \
+                     ((1 - global_step / decay_steps) ** power) + end_learning_rate
+    return decayed_lr
 
 
 def piecewise_decay(global_step, boundaries, values):
@@ -200,24 +215,27 @@ def piecewise_decay(global_step, boundaries, values):
     if not isinstance(global_step, Variable):
         raise ValueError("global_step is required for piecewise_decay.")
 
-    lr = layers.create_global_var(
-        shape=[1],
-        value=0.0,
-        dtype='float32',
-        persistable=True,
-        name="learning_rate")
-
-    with layers.Switch() as switch:
-        for i in range(len(boundaries)):
-            boundary_val = layers.fill_constant(
-                shape=[1], dtype='float32', value=float(boundaries[i]))
-            value_var = layers.fill_constant(
-                shape=[1], dtype='float32', value=float(values[i]))
-            with switch.case(layers.less_than(global_step, boundary_val)):
-                layers.assign(value_var, lr)
-        last_value_var = layers.fill_constant(
-            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
-        with switch.default():
-            layers.assign(last_value_var, lr)
+    with init_on_cpu():
+        lr = layers.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+
+        with layers.Switch() as switch:
+            for i in range(len(boundaries)):
+                boundary_val = layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(boundaries[i]))
+                value_var = layers.fill_constant(
+                    shape=[1], dtype='float32', value=float(values[i]))
+                with switch.case(layers.less_than(global_step, boundary_val)):
+                    layers.assign(value_var, lr)
+            last_value_var = layers.fill_constant(
+                shape=[1],
+                dtype='float32',
+                value=float(values[len(values) - 1]))
+            with switch.default():
+                layers.assign(last_value_var, lr)
 
     return lr
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index 1491f7a8d54..f33e81186bd 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -18,6 +18,7 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.dataset.conll05 as conll05
 import paddle.v2.fluid as fluid
+from paddle.v2.fluid.initializer import init_on_cpu
 import contextlib
 import time
 import unittest
@@ -167,7 +168,16 @@ def train(use_cuda, save_dirname=None):
 
     # TODO(qiao)
     # check other optimizers and check why out will be NAN
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    global_step = fluid.layers.create_global_var(
+        shape=[1], value=0, dtype='float32', force_cpu=True, persistable=True)
+    sgd_optimizer = fluid.optimizer.SGD(
+        learning_rate=fluid.learning_rate_decay.exponential_decay(
+            learning_rate=0.0001,
+            global_step=global_step,
+            decay_steps=100000,
+            decay_rate=0.5,
+            staircase=True),
+        global_step=global_step)
     sgd_optimizer.minimize(avg_cost)
 
     # TODO(qiao)
-- 
GitLab