diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 39d13d3ab5fb8340509e01b0bd1de6f66ce99c21..3f407d05768f707507e4a1339a64f3d7ae4506a9 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -26,7 +26,6 @@ import initializer
 import layers
 import nets
 import optimizer
-import learning_rate_decay
 import backward
 import regularizer
 from param_attr import ParamAttr, WeightNormParamAttr
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index 906a16a49f728526a41d1bc6da3a40e30bbfa33f..14d33582f41a33da49b1e5176b2094a6a81b3dac 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -28,6 +28,7 @@ import math_op_patch
 from math_op_patch import *
 import detection
 from detection import *
+from learning_rate_scheduler import *
 
 __all__ = []
 __all__ += math_op_patch.__all__
@@ -38,3 +39,4 @@ __all__ += control_flow.__all__
 __all__ += ops.__all__
 __all__ += device.__all__
 __all__ += detection.__all__
+__all__ += learning_rate_scheduler.__all__
diff --git a/python/paddle/fluid/learning_rate_decay.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
similarity index 85%
rename from python/paddle/fluid/learning_rate_decay.py
rename to python/paddle/fluid/layers/learning_rate_scheduler.py
index 631efa048740ea3d50947a321ae2e76c6a6048af..65b95a58d6546ed6d6b264443a7c802e16eef23f 100644
--- a/python/paddle/fluid/learning_rate_decay.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import layers
-from initializer import init_on_cpu
+import control_flow
+import nn
+import ops
+import tensor
+from ..initializer import init_on_cpu
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
@@ -31,9 +34,9 @@ strategy according to this module.
 """
 
 
 def _decay_step_counter():
     # the first global step is zero in learning rate decay
-    global_step = layers.autoincreased_step_counter(
+    global_step = nn.autoincreased_step_counter(
         counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
-    global_step = layers.cast(global_step, 'float32')
+    global_step = tensor.cast(global_step, 'float32')
     return global_step
@@ -60,7 +63,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
         # update learning_rate
         div_res = global_step / decay_steps
         if staircase:
-            div_res = layers.floor(x=div_res)
+            div_res = ops.floor(div_res)
         decayed_lr = learning_rate * (decay_rate**div_res)
 
     return decayed_lr
@@ -89,8 +92,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     with init_on_cpu():
         div_res = global_step / decay_steps
         if staircase:
-            div_res = layers.floor(x=div_res)
-        decayed_lr = learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+            div_res = ops.floor(div_res)
+        decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
 
     return decayed_lr
 
@@ -118,7 +121,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     with init_on_cpu():
         div_res = global_step / decay_steps
         if staircase:
-            div_res = layers.floor(x=div_res)
+            div_res = ops.floor(div_res)
 
         decayed_lr = learning_rate / (1 + decay_rate * div_res)
 
@@ -154,21 +157,20 @@ def polynomial_decay(learning_rate,
 
     with init_on_cpu():
         if cycle:
-            div_res = layers.ceil(x=(global_step / decay_steps))
-            zero_var = layers.fill_constant(
+            div_res = ops.ceil(global_step / decay_steps)
+            zero_var = tensor.fill_constant(
                 shape=[1], dtype='float32', value=0.0)
-            one_var = layers.fill_constant(
+            one_var = tensor.fill_constant(
                 shape=[1], dtype='float32', value=1.0)
 
-            with layers.Switch() as switch:
+            with control_flow.Switch() as switch:
                 with switch.case(global_step == zero_var):
-                    layers.assign(input=one_var, output=div_res)
+                    tensor.assign(input=one_var, output=div_res)
             decay_steps = decay_steps * div_res
         else:
-            decay_steps_var = layers.fill_constant(
+            decay_steps_var = tensor.fill_constant(
                 shape=[1], dtype='float32', value=float(decay_steps))
-            global_step = layers.elementwise_min(
-                x=global_step, y=decay_steps_var)
+            global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
 
         decayed_lr = (learning_rate - end_learning_rate) * \
             ((1 - global_step / decay_steps) ** power) + end_learning_rate
@@ -195,26 +197,26 @@ def piecewise_decay(boundaries, values):
     global_step = _decay_step_counter()
 
     with init_on_cpu():
-        lr = layers.create_global_var(
+        lr = tensor.create_global_var(
             shape=[1],
             value=0.0,
             dtype='float32',
             persistable=True,
             name="learning_rate")
 
-        with layers.Switch() as switch:
+        with control_flow.Switch() as switch:
             for i in range(len(boundaries)):
-                boundary_val = layers.fill_constant(
+                boundary_val = tensor.fill_constant(
                     shape=[1], dtype='float32', value=float(boundaries[i]))
-                value_var = layers.fill_constant(
+                value_var = tensor.fill_constant(
                     shape=[1], dtype='float32', value=float(values[i]))
                 with switch.case(global_step < boundary_val):
-                    layers.assign(value_var, lr)
-            last_value_var = layers.fill_constant(
+                    tensor.assign(value_var, lr)
+            last_value_var = tensor.fill_constant(
                 shape=[1],
                 dtype='float32',
                 value=float(values[len(values) - 1]))
             with switch.default():
-                layers.assign(last_value_var, lr)
+                tensor.assign(last_value_var, lr)
 
     return lr
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 5c6374b93175d85c49633b73b20aa5e3b64ff9f1..f488527e0bc69059bc44422aa28188441f3d5b54 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -170,7 +170,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
     # TODO(qiao)
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.learning_rate_decay.exponential_decay(
+        learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.0001,
            decay_steps=100000,
            decay_rate=0.5,
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
similarity index 86%
rename from python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
rename to python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 5c221a0325b6cdc27ec22e5a8b02ae8eec9f6d80..ab25bfffaa45020cc854e44b593776e90638cf72 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_decay.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -17,8 +17,8 @@ import math
 import unittest
 
 import paddle.fluid as fluid
+import paddle.fluid.layers as layers
 import paddle.fluid.framework as framework
-import paddle.fluid.learning_rate_decay as lr_decay
 
 
 def exponential_decay(learning_rate,
@@ -111,27 +111,24 @@ class TestLearningRateDecay(unittest.TestCase):
         common_kwargs_false["staircase"] = False
 
         decay_fns = [
-            (exponential_decay, lr_decay.exponential_decay, common_kwargs_true),
-            (exponential_decay, lr_decay.exponential_decay,
+            (exponential_decay, layers.exponential_decay, common_kwargs_true),
+            (exponential_decay, layers.exponential_decay,
              common_kwargs_false),
+            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_true),
+            (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
+            (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
+            (inverse_time_decay, layers.inverse_time_decay, common_kwargs_false),
-            (natural_exp_decay, lr_decay.natural_exp_decay, common_kwargs_true),
-            (natural_exp_decay, lr_decay.natural_exp_decay,
-             common_kwargs_false),
-            (inverse_time_decay, lr_decay.inverse_time_decay,
-             common_kwargs_true),
-            (inverse_time_decay, lr_decay.inverse_time_decay,
-             common_kwargs_false),
-            (polynomial_decay, lr_decay.polynomial_decay, {
+            (polynomial_decay, layers.polynomial_decay, {
                 "learning_rate": 1.0,
                 "decay_steps": 5,
                 "cycle": True
             }),
-            (polynomial_decay, lr_decay.polynomial_decay, {
+            (polynomial_decay, layers.polynomial_decay, {
                 "learning_rate": 1.0,
                 "decay_steps": 5,
                 "cycle": False
            }),
-            (piecewise_decay, lr_decay.piecewise_decay, {
+            (piecewise_decay, layers.piecewise_decay, {
                 "boundaries": [3, 6, 9],
                 "values": [0.1, 0.2, 0.3, 0.4]
             }),
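Usage note: with this rename the schedulers are reached through fluid.layers instead of the removed fluid.learning_rate_decay module. A minimal sketch of an updated call site follows, assuming the usual fluid program setup; the SGD optimizer and hyperparameter values mirror test_label_semantic_roles.py above, while staircase=True is an illustrative choice (the parameter defaults to False):

    import paddle.fluid as fluid

    # exponential_decay returns a tensor holding the current learning rate,
    # which the optimizer accepts in place of a Python float.
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.0001,   # base learning rate
            decay_steps=100000,     # steps between decay events
            decay_rate=0.5,         # multiplier applied per decay_steps
            staircase=True))        # illustrative; floors step/decay_steps

The same pattern applies to the other schedulers, e.g. fluid.layers.piecewise_decay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4]), whose Switch block updates the rate as the global step counter crosses each boundary.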