From be801d6c056c3435922e345d9d2ea105120b812d Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 31 Jan 2018 10:37:09 +0800 Subject: [PATCH] Add learning rate decay (#7892) * add basic interface for learning rate decay * add exponential_decay * add natural_exp_decay * add inverse_time_decay --- paddle/operators/elementwise_pow_op.cc | 37 ++++++ paddle/operators/elementwise_pow_op.cu | 20 +++ paddle/operators/elementwise_pow_op.h | 37 ++++++ python/paddle/v2/fluid/__init__.py | 2 + .../paddle/v2/fluid/layers/math_op_patch.py | 4 +- python/paddle/v2/fluid/layers/ops.py | 1 + python/paddle/v2/fluid/layers/tensor.py | 13 +- python/paddle/v2/fluid/learning_rate_decay.py | 125 ++++++++++++++++++ python/paddle/v2/fluid/optimizer.py | 66 +++++---- .../tests/book/test_label_semantic_roles.py | 2 +- .../v2/fluid/tests/test_elementwise_pow_op.py | 43 ++++++ .../fluid/tests/test_learning_rate_decay.py | 110 +++++++++++++++ 12 files changed, 432 insertions(+), 28 deletions(-) create mode 100644 paddle/operators/elementwise_pow_op.cc create mode 100644 paddle/operators/elementwise_pow_op.cu create mode 100644 paddle/operators/elementwise_pow_op.h create mode 100644 python/paddle/v2/fluid/learning_rate_decay.py create mode 100644 python/paddle/v2/fluid/tests/test_elementwise_pow_op.py create mode 100644 python/paddle/v2/fluid/tests/test_learning_rate_decay.py diff --git a/paddle/operators/elementwise_pow_op.cc b/paddle/operators/elementwise_pow_op.cc new file mode 100644 index 000000000..5293cc7dd --- /dev/null +++ b/paddle/operators/elementwise_pow_op.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/elementwise_pow_op.h" +#include "paddle/operators/elementwise_op.h" + +namespace paddle { +namespace operators { +class ElementwisePowOpMaker : public ElementwiseOpMaker { + public: + ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : ElementwiseOpMaker(proto, op_checker) { + SetComment("Pow", "Out = X ^ Y"); + AddComment(comment_); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp, + ops::ElementwisePowOpMaker); +REGISTER_OP_CPU_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/operators/elementwise_pow_op.cu b/paddle/operators/elementwise_pow_op.cu new file mode 100644 index 000000000..643c978e6 --- /dev/null +++ b/paddle/operators/elementwise_pow_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/elementwise_pow_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + elementwise_pow, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h new file mode 100644 index 000000000..6019e709e --- /dev/null +++ b/paddle/operators/elementwise_pow_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/operators/elementwise_op_function.h" + +namespace paddle { +namespace operators { + +template +struct PowFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); } +}; + +template +class ElementwisePowKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElementwiseComputeEx, DeviceContext, T>(ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index a542e3dba..18c8343d0 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -26,6 +26,7 @@ import initializer import layers import nets import optimizer +import learning_rate_decay import backward import regularizer from param_attr import ParamAttr @@ -44,6 +45,7 @@ __all__ = framework.__all__ + executor.__all__ + [ 'layers', 'nets', 'optimizer', + 'learning_rate_decay', 'backward', 'regularizer', 'LoDTensor', diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py index f359e7012..79a130a3e 100644 --- a/python/paddle/v2/fluid/layers/math_op_patch.py +++ b/python/paddle/v2/fluid/layers/math_op_patch.py @@ -145,7 +145,9 @@ def monkey_patch_variable(): # a*b == b*a. 
Do not need to reverse explicitly ("__rmul__", "elementwise_mul", False), ("__div__", "elementwise_div", False), - ("__rdiv__", "elementwise_div", True)): + ("__rdiv__", "elementwise_div", True), + ("__pow__", "elementwise_pow", False), + ("__rpow__", "elementwise_pow", True)): setattr(Variable, method_name, _elemwise_method_creator_(method_name, op_type, reverse)) diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py index 022a94cad..ee3172c7b 100644 --- a/python/paddle/v2/fluid/layers/ops.py +++ b/python/paddle/v2/fluid/layers/ops.py @@ -56,6 +56,7 @@ __all__ = [ 'elementwise_mul', 'elementwise_max', 'elementwise_min', + 'elementwise_pow', 'clip', 'clip_by_norm', 'sequence_softmax', diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index 6e7d09459..c435c5206 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper from ..param_attr import ParamAttr from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable +from ..initializer import Constant from ..core import DataType import numpy __all__ = [ 'create_tensor', 'create_parameter', + 'create_global_var', 'cast', 'concat', 'sums', @@ -58,13 +60,22 @@ def create_parameter(shape, Returns: Parameter: the created parameter """ - helper = LayerHelper("create_parameter") + helper = LayerHelper("create_parameter", **locals()) if attr is None: attr = ParamAttr() return helper.create_parameter(attr, shape, dtype, is_bias, default_initializer) +def create_global_var(shape, value, dtype, persistable=False, name=None): + helper = LayerHelper("global_var", **locals()) + var = helper.create_global_variable( + dtype=dtype, shape=shape, persistable=persistable, name=name) + helper.set_variable_initializer( + var, initializer=Constant(value=float(value))) + return var + + def cast(x, dtype): """ This function takes in the input with input_dtype diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py new file mode 100644 index 000000000..96b3e9a0d --- /dev/null +++ b/python/paddle/v2/fluid/learning_rate_decay.py @@ -0,0 +1,125 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import layers +from framework import Variable + +__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay'] +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. +""" + + +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + """Applies exponential decay to the learning rate. 
+
+    ```python
+    decayed_learning_rate = learning_rate *
+            decay_rate ^ (global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        global_step: A Variable that records the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set to true, decay the learning rate every `decay_steps` steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for exponential_decay.")
+
+    # update learning_rate
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * (decay_rate**div_res)
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies natural exponential decay to the initial learning rate.
+
+    ```python
+    if not staircase:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    else:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        global_step: A Variable that records the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set to true, decay the learning rate every `decay_steps` steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for natural_exp_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    """Applies inverse time decay to the initial learning rate.
+
+    ```python
+    if staircase:
+        decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_steps))
+    else:
+        decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        global_step: A Variable that records the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set to true, decay the learning rate every `decay_steps` steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+
+    return learning_rate / (1 + decay_rate * div_res)
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index 0c3533b89..7844a4e2d 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -15,6 +15,7 @@
 from collections import defaultdict
 
 import framework
+import layers
 from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
@@ -33,9 +34,11 @@ class Optimizer(object):
     but need to use one of it's implementation.
""" - def __init__(self, global_step=None, regularization=None): + def __init__(self, learning_rate, global_step=None, regularization=None): + assert learning_rate is not None self._global_step = global_step self.regularization = regularization + self._global_learning_rate = learning_rate # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -43,6 +46,28 @@ class Optimizer(object): self._accumulators = defaultdict(lambda: dict()) self.helper = None + def _create_global_learning_rate(self): + if isinstance(self._global_learning_rate, float): + self._global_learning_rate = layers.create_global_var( + name=unique_name("learning_rate"), + shape=[1], + value=float(self._global_learning_rate), + dtype='float32', + persistable=True) + + if not isinstance(self._global_learning_rate, framework.Variable): + raise ValueError("learning rate should be a Variable, " + "actual type is %s", + type(self._global_learning_rate)) + + @property + def global_learning_rate(self): + """ + get global decayed learning rate + :return: + """ + return self._global_learning_rate + def _append_optimize_op(self, block, param_and_grad): """ append optimize operator to block and return all the added optimize_op """ @@ -52,17 +77,7 @@ class Optimizer(object): # create learning rate variable for every parameter param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] - param_lr_shape = [1] - param_lr_var = self.helper.create_global_variable( - name=unique_name("learning_rate"), - dtype='float32', - shape=param_lr_shape, - lod_level=1, - persistable=True) - param_lr = param_lr * self._learning_rate - self.helper.set_variable_initializer( - var=param_lr_var, initializer=Constant(param_lr)) - return param_lr_var + return self._global_learning_rate * param_lr def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters @@ -163,7 +178,7 @@ class Optimizer(object): optimization. This will include parameter update ops, global step update ops and any other custom ops required by subclasses to manage their internal state. - :param startup_program: + :param startup_program: """ # This is a default implementation of create_optimization_pass that # can be shared by most optimizers. 
This implementation assumes that @@ -178,6 +193,7 @@ class Optimizer(object): self.helper = LayerHelper(self.__class__.__name__) self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) + self._create_global_learning_rate() optimize_ops = [] for param_and_grad in parameters_and_grads: @@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer): def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(**kwargs) + super(SGDOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "sgd" - self._learning_rate = learning_rate def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer): def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(**kwargs) + super(MomentumOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "momentum" - self._learning_rate = learning_rate self._momentum = momentum self._use_nesterov = bool(use_nesterov) @@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer): def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(**kwargs) + super(AdagradOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "adagrad" - self._learning_rate = learning_rate self._epsilon = epsilon def _create_accumulators(self, block, parameters): @@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(**kwargs) + super(AdamOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "adam" - self._learning_rate = learning_rate self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon @@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer): assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__(**kwargs) + super(AdamaxOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "adamax" - self._learning_rate = learning_rate self._beta1 = beta1 self._beta2 = beta2 self._epsilon = epsilon @@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer): assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(**kwargs) + super(DecayedAdagradOptimizer, self).__init__( + learning_rate=learning_rate, **kwargs) self.type = "decayed_adagrad" - self._learning_rate = learning_rate self._decay = decay self._epsilon = epsilon diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 1a342bf1f..f85768de9 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -175,7 +175,7 @@ def main(): paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) - #place = fluid.CPUPlace() + # place = fluid.CPUPlace() place = fluid.CUDAPlace(0) feeder = fluid.DataFeeder( feed_list=[ diff --git a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py new file mode 100644 index 000000000..e31749df9 --- /dev/null +++ 
b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import numpy as np +from op_test import OpTest + + +class TestElementwisePowOp(OpTest): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32") + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + +class TestElementwisePowOp_scalar(TestElementwisePowOp): + def setUp(self): + self.op_type = "elementwise_pow" + self.inputs = { + 'X': np.random.rand(2, 3, 4).astype('float32'), + 'Y': np.random.rand(1).astype('float32') + } + self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py new file mode 100644 index 000000000..dc348cf2d --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py @@ -0,0 +1,110 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import math +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.learning_rate_decay as lr_decay + + +def exponential_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + exponent = float(global_step) / float(decay_steps) + if staircase: + exponent = math.floor(exponent) + return learning_rate * decay_rate**exponent + + +def natural_exp_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + exponent = float(global_step) / float(decay_steps) + if staircase: + exponent = math.floor(exponent) + return learning_rate * math.exp(-1 * decay_rate * exponent) + + +def inverse_time_decay(learning_rate, + global_step, + decay_steps, + decay_rate, + staircase=False): + temp = float(global_step) / float(decay_steps) + if staircase: + temp = math.floor(temp) + return learning_rate / (1 + decay_rate * temp) + + +class TestLearningRateDecay(unittest.TestCase): + def check_decay(self, python_decay_fn, fluid_decay_fn, staircase): + init_lr = 1.0 + decay_steps = 5 + decay_rate = 0.5 + + global_step = layers.create_global_var( + shape=[1], value=0.0, dtype='float32', persistable=True) + + decayed_lr = fluid_decay_fn( + learning_rate=init_lr, + global_step=global_step, + decay_steps=decay_steps, + decay_rate=decay_rate, + staircase=staircase) + layers.increment(global_step, 1.0) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + for step in range(10): + step_val, lr_val = exe.run(fluid.default_main_program(), + feed=[], + fetch_list=[global_step, decayed_lr]) + python_decayed_lr = python_decay_fn( + learning_rate=init_lr, + global_step=step, + decay_steps=decay_steps, + decay_rate=decay_rate, + staircase=staircase) + self.assertAlmostEqual(python_decayed_lr, lr_val[0]) + + def test_decay(self): + decay_fns = [ + (exponential_decay, lr_decay.exponential_decay, True), + (exponential_decay, lr_decay.exponential_decay, False), + (natural_exp_decay, lr_decay.natural_exp_decay, True), + (natural_exp_decay, lr_decay.natural_exp_decay, False), + (inverse_time_decay, lr_decay.inverse_time_decay, True), + (inverse_time_decay, lr_decay.inverse_time_decay, False), + ] + + for py_decay_fn, fluid_decay_fn, staircase in decay_fns: + print("decay_fn=" + str(py_decay_fn) + " staircase=" + str( + staircase)) + main_program = framework.Program() + startup_program = framework.Program() + with framework.program_guard(main_program, startup_program): + self.check_decay(py_decay_fn, fluid_decay_fn, staircase) + + +if __name__ == '__main__': + unittest.main() -- GitLab
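Usage sketch (not part of the patch): the new test above already shows how the decay helpers are meant to be wired into a program. The snippet below restates that pattern as a minimal example; the hyperparameter values (0.01, 1000, 0.5) are illustrative, and the final SGDOptimizer hookup is an assumption based on the new `Optimizer.__init__(learning_rate, ...)` signature rather than something exercised by the tests in this patch.

```python
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.learning_rate_decay as lr_decay

# Global step counter, incremented once per executed iteration of the main program.
global_step = layers.create_global_var(
    shape=[1], value=0.0, dtype='float32', persistable=True)

# Build the decayed learning rate as a Variable inside the program
# (illustrative hyperparameters: lr halves every 1000 steps with staircase=True).
decayed_lr = lr_decay.exponential_decay(
    learning_rate=0.01,
    global_step=global_step,
    decay_steps=1000,
    decay_rate=0.5,
    staircase=True)
layers.increment(global_step, 1.0)

# Assumption: a Variable learning rate can be passed directly to an optimizer,
# since Optimizer._create_global_learning_rate() accepts a float or a Variable.
sgd_optimizer = fluid.optimizer.SGDOptimizer(learning_rate=decayed_lr)
```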