diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c7eb260aea8478f4833cb79253f4481e10b8685..e8ea828dd2a25f5f47b03e92ae86e083d4425dc9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 585db019d521b1699baadfae31ef95b5059c71b4..18770fe2861380ea1320aef5cb7ec3432147d7ce 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -224,12 +224,18 @@ function(cc_test TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction(cc_test)
 
@@ -457,7 +463,7 @@ endfunction()
 
 function(py_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
index 0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa..a591c7334fd31c98a94b50a4344f251560a0f2f9 100644
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -1,9 +1,14 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-DataFeeder
+data_feeder
 ===========
 
 DataFeeder
------------
-..  automodule:: paddle.v2.fluid.data_feeder
-    :members: DataFeeder
+----------
+
+..  autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+    :members:
     :noindex:
+
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
index a23f3301d0331e0ea3733f06444515eb4680cd31..00dcecfd628a35d83d1c596bf0aea819a1705862 100644
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -1,9 +1,21 @@
-===========
-Evaluator
-===========
-
-Evaluator
------------
-..  automodule:: paddle.v2.fluid.evaluator
-    :members: Evaluator
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=========
+evaluator
+=========
+
+Accuracy
+--------
+
+..  autoclass:: paddle.v2.fluid.evaluator.Accuracy
+    :members:
     :noindex:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+    :members:
+    :noindex:
+
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
index 3a283538c120cfa1ef646c390bb71c6251c23675..a028f6283f2ca333bdf6c9857a98661c0222b41e 100644
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
@@ -1,9 +1,32 @@
-===========
-Executor
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+executor
+========
 
 Executor
+--------
+
+..  autoclass:: paddle.v2.fluid.executor.Executor
+    :members:
+    :noindex:
+
+global_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.global_scope
+    :noindex:
+
+scope_guard
 -----------
-..  automodule:: paddle.v2.fluid.executor
-    :members: Executor
+
+..  autofunction:: paddle.v2.fluid.executor.scope_guard
+    :noindex:
+
+switch_scope
+------------
+
+..  autofunction:: paddle.v2.fluid.executor.switch_scope
     :noindex:
+
diff --git a/doc/api/v2/fluid/gen_doc.py b/doc/api/v2/fluid/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2147fd3f7ea635d8f14210fbcd1a568ee2230ee
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.py
@@ -0,0 +1,109 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.v2.fluid as fluid
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--submodules', nargs="*")
+    parser.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return parser.parse_args()
+
+
+class DocGenerator(object):
+    def __init__(self, module_name, stream=sys.stdout):
+        self.stream = stream
+        self.module_name = module_name
+        if not hasattr(fluid, module_name):
+            raise ValueError("Cannot find fluid.{0}".format(module_name))
+        else:
+            self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+        self._print_header_(module_name, dot='=', is_title=True)
+
+    def print_submodule(self, submodule_name):
+        submodule = getattr(self.module, submodule_name)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+
+        for item in submodule.__all__:
+            self.print_item(item)
+
+    def print_current_module(self):
+        for item in self.module.__all__:
+            self.print_item(item)
+
+    def print_section(self, name):
+        self._print_header_(name, dot='=', is_title=False)
+
+    def print_item(self, name):
+        item = getattr(self.module, name)
+        if isinstance(item, types.TypeType):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            raise RuntimeError("Unsupported item {0}".format(name))
+
+    def print_class(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.v2.fluid.{0}.{1}
+    :members:
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def print_method(self, name):
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.v2.fluid.{0}.{1}
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def _print_header_(self, name, dot, is_title):
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+
+
+def main():
+    args = parse_arg()
+    gen = DocGenerator(args.module)
+    if args.submodules is None:
+        gen.print_current_module()
+    else:
+        for submodule_name in args.submodules:
+            gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/api/v2/fluid/gen_doc.sh b/doc/api/v2/fluid/gen_doc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ba7b7ba8e51399deb852b0a7c8ddd3128f521e85
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+
+for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
+do
+  python gen_doc.py ${module} > ${module}.rst
+done
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
index 8f587837e9873370722062404f511654a9460587..c38be033fff2997930525f51c93995db09daa2b6 100644
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
@@ -1,50 +1,35 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-Initializer
+initializer
 ===========
 
+Constant
+--------
 
-
-Initializer
------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: Initializer
-    :noindex:
-
-
-
-ConstantInitializer
--------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: ConstantInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Constant
+    :members:
     :noindex:
 
+Uniform
+-------
 
-
-UniformInitializer
-------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: UniformInitializer
-    :noindex:
-
-
-
-NormalInitializer
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: NormalInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Uniform
+    :members:
     :noindex:
 
+Normal
+------
 
-XavierInitializer
------------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: XavierInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Normal
+    :members:
     :noindex:
 
+Xavier
+------
 
-MSRAInitializer
----------------
-..  automodule:: paddle.v2.fluid.initializer
-    :members: MSRAInitializer
+..  autoclass:: paddle.v2.fluid.initializer.Xavier
+    :members:
     :noindex:
 
diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst
index 67f68c4e9e16b379207b8de114cdf769e056f78e..37c9c273e369532e8ff596e9649cb695a98a2505 100644
--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
@@ -1,10 +1,61 @@
-===========
-IO
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+==
+io
+==
 
+save_vars
+---------
 
-is_parameter
+..  autofunction:: paddle.v2.fluid.io.save_vars
+    :noindex:
+
+save_params
 -----------
-..  autofunction:: paddle.v2.fluid.io.is_parameter
+
+..  autofunction:: paddle.v2.fluid.io.save_params
+    :noindex:
+
+save_persistables
+-----------------
+
+..  autofunction:: paddle.v2.fluid.io.save_persistables
+    :noindex:
+
+load_vars
+---------
+
+..  autofunction:: paddle.v2.fluid.io.load_vars
+    :noindex:
+
+load_params
+-----------
+
+..  autofunction:: paddle.v2.fluid.io.load_params
     :noindex:
+
+load_persistables
+-----------------
+
+..  autofunction:: paddle.v2.fluid.io.load_persistables
+    :noindex:
+
+save_inference_model
+--------------------
+
+..  autofunction:: paddle.v2.fluid.io.save_inference_model
+    :noindex:
+
+load_inference_model
+--------------------
+
+..  autofunction:: paddle.v2.fluid.io.load_inference_model
+    :noindex:
+
+get_inference_program
+---------------------
+
+..  autofunction:: paddle.v2.fluid.io.get_inference_program
+    :noindex:
+
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 231ec2d4ba102a5d31c47cbc7a5d484ef17a7f3a..e24613b94b422b7cdf9c6383c359fa92a4faf6ff 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -1,546 +1,799 @@
-==========
-Layers
-==========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+======
+layers
+======
 
-fc
----
-..  autofunction:: paddle.v2.fluid.layers.fc
+control_flow
+============
+
+split_lod_tensor
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
     :noindex:
 
-embedding
----------
-..  autofunction:: paddle.v2.fluid.layers.embedding
+merge_lod_tensor
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
     :noindex:
 
-dynamic_lstm
-------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+BlockGuard
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.BlockGuard
+    :members:
     :noindex:
 
-dynamic_lstmp
--------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+BlockGuardWithCompletion
+------------------------
+
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+    :members:
     :noindex:
 
-dynamic_gru
------------
-..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+StaticRNNMemoryLink
+-------------------
+
+..  autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+    :members:
     :noindex:
 
-data
-----
-..  autofunction:: paddle.v2.fluid.layers.data
+WhileGuard
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.WhileGuard
+    :members:
     :noindex:
 
-mean
-----
-..  autofunction:: paddle.v2.fluid.layers.mean
+While
+-----
+
+..  autoclass:: paddle.v2.fluid.layers.While
+    :members:
     :noindex:
 
-mul
----
-..  autofunction:: paddle.v2.fluid.layers.mul
+lod_rank_table
+--------------
+
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
     :noindex:
 
-elementwise_add
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+max_sequence_len
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
     :noindex:
 
-elementwise_sub
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+topk
+----
+
+..  autofunction:: paddle.v2.fluid.layers.topk
     :noindex:
 
-elementwise_mul
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+lod_tensor_to_array
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
     :noindex:
 
-elementwise_div
----------------
-..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+array_to_lod_tensor
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
     :noindex:
 
+increment
+---------
 
-dropout
--------
-..  autofunction:: paddle.v2.fluid.layers.dropout
+..  autofunction:: paddle.v2.fluid.layers.increment
     :noindex:
 
+array_write
+-----------
 
-reshape
---------
-..  autofunction:: paddle.v2.fluid.layers.reshape
+..  autofunction:: paddle.v2.fluid.layers.array_write
     :noindex:
 
+create_array
+------------
 
-sigmoid
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+
+less_than
 ---------
-..  autofunction:: paddle.v2.fluid.layers.sigmoid
+
+..  autofunction:: paddle.v2.fluid.layers.less_than
     :noindex:
 
+array_read
+----------
 
-scale
----------
-..  autofunction:: paddle.v2.fluid.layers.scale
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+
+shrink_memory
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
     :noindex:
 
+array_length
+------------
 
-transpose
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+
+IfElse
+------
+
+..  autoclass:: paddle.v2.fluid.layers.IfElse
+    :members:
+    :noindex:
+
+DynamicRNN
+----------
+
+..  autoclass:: paddle.v2.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+
+ConditionalBlock
+----------------
+
+..  autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+
+StaticRNN
 ---------
-..  autofunction:: paddle.v2.fluid.layers.transpose
+
+..  autoclass:: paddle.v2.fluid.layers.StaticRNN
+    :members:
     :noindex:
 
+reorder_lod_tensor_by_rank
+--------------------------
 
-sigmoid_cross_entropy_with_logits
----------------------------------
-..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+..  autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
     :noindex:
 
+ParallelDo
+----------
 
-cast
+..  autoclass:: paddle.v2.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+
+Print
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.Print
+    :noindex:
+
+device
+======
+
+get_places
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.get_places
+    :noindex:
+
+io
+==
+
+data
 ----
-..  autofunction:: paddle.v2.fluid.layers.cast
+
+..  autofunction:: paddle.v2.fluid.layers.data
     :noindex:
 
+BlockGuardServ
+--------------
 
-concat
--------
-..  autofunction:: paddle.v2.fluid.layers.concat
+..  autoclass:: paddle.v2.fluid.layers.BlockGuardServ
+    :members:
     :noindex:
 
+ListenAndServ
+-------------
 
-sums
+..  autoclass:: paddle.v2.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+
+Send
 ----
-..  autofunction:: paddle.v2.fluid.layers.sums
+
+..  autofunction:: paddle.v2.fluid.layers.Send
     :noindex:
 
+nn
+==
 
-linear_chain_crf
-----------------
-..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+fc
+--
+
+..  autofunction:: paddle.v2.fluid.layers.fc
     :noindex:
 
+embedding
+---------
 
-assign
--------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
     :noindex:
 
+dynamic_lstm
+------------
 
-split_lod_tensor
-----------------
-..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
     :noindex:
 
+dynamic_lstmp
+-------------
 
-merge_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+
+dynamic_gru
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+
+gru_unit
+--------
+
+..  autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+
+linear_chain_crf
 ----------------
-..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+
+crf_decoding
+------------
+
+..  autofunction:: paddle.v2.fluid.layers.crf_decoding
     :noindex:
 
 cos_sim
---------
+-------
+
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
     :noindex:
 
-
 cross_entropy
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
     :noindex:
 
-
-
 square_error_cost
 -----------------
+
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
     :noindex:
 
-
 accuracy
----------
+--------
+
 ..  autofunction:: paddle.v2.fluid.layers.accuracy
     :noindex:
 
+chunk_eval
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.chunk_eval
+    :noindex:
 
 sequence_conv
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
     :noindex:
 
-
 conv2d
 ------
+
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
     :noindex:
 
-
 sequence_pool
 -------------
+
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
     :noindex:
 
+pool2d
+------
 
-sequence_first_step
--------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+..  autofunction:: paddle.v2.fluid.layers.pool2d
     :noindex:
 
+batch_norm
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
 
-sequence_last_step
+beam_search_decode
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
     :noindex:
 
+conv2d_transpose
+----------------
 
-pool2d
-------
-..  autofunction:: paddle.v2.fluid.layers.pool2d
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
     :noindex:
 
+sequence_expand
+---------------
 
-batch_norm
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+lstm_unit
+---------
+
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+reduce_sum
 ----------
-..  autofunction:: paddle.v2.fluid.layers.batch_norm
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+reduce_mean
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
     :noindex:
 
+reduce_max
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
 
-beam_search_decode
+reduce_min
+----------
+
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+
+sequence_first_step
+-------------------
+
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+sequence_last_step
 ------------------
-..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
+dropout
+-------
+
+..  autofunction:: paddle.v2.fluid.layers.dropout
     :noindex:
 
+split
+-----
 
-lod_rank_table
---------------
-..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+..  autofunction:: paddle.v2.fluid.layers.split
     :noindex:
 
+ctc_greedy_decoder
+------------------
 
-max_sequence_len
-----------------
-..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
     :noindex:
 
+edit_distance
+-------------
 
-topk
------
-..  autofunction:: paddle.v2.fluid.layers.topk
+..  autofunction:: paddle.v2.fluid.layers.edit_distance
     :noindex:
 
+l2_normalize
+------------
 
-lod_tensor_to_array
--------------------
-..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
     :noindex:
 
+matmul
+------
 
-
-array_to_lod_tensor
--------------------
-..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+..  autofunction:: paddle.v2.fluid.layers.matmul
     :noindex:
 
+warpctc
+-------
 
+..  autofunction:: paddle.v2.fluid.layers.warpctc
+    :noindex:
 
+sequence_reshape
+----------------
 
-fill_constant
--------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
     :noindex:
 
+transpose
+---------
 
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
 
-fill_constant_batch_size_like
------------------------------
-..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+im2sequence
+-----------
+
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
     :noindex:
 
+nce
+---
 
-ones
-----
-..  autofunction:: paddle.v2.fluid.layers.ones
+..  autofunction:: paddle.v2.fluid.layers.nce
     :noindex:
 
+beam_search
+-----------
 
-zeros
------
-..  autofunction:: paddle.v2.fluid.layers.zeros
+..  autofunction:: paddle.v2.fluid.layers.beam_search
     :noindex:
 
+row_conv
+--------
 
-increment
----------
-..  autofunction:: paddle.v2.fluid.layers.increment
+..  autofunction:: paddle.v2.fluid.layers.row_conv
     :noindex:
 
+multiplex
+---------
 
-array_write
------------
-..  autofunction:: paddle.v2.fluid.layers.array_write
+..  autofunction:: paddle.v2.fluid.layers.multiplex
     :noindex:
 
+ops
+===
 
+mean
+----
 
-create_array
-------------
-..  autofunction:: paddle.v2.fluid.layers.create_array
+..  autofunction:: paddle.v2.fluid.layers.mean
     :noindex:
 
+mul
+---
 
-less_than
----------
-..  autofunction:: paddle.v2.fluid.layers.less_than
+..  autofunction:: paddle.v2.fluid.layers.mul
     :noindex:
 
+reshape
+-------
 
-array_read
-----------
-..  autofunction:: paddle.v2.fluid.layers.array_read
+..  autofunction:: paddle.v2.fluid.layers.reshape
     :noindex:
 
+scale
+-----
 
-shrink_memory
---------------
-..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+..  autofunction:: paddle.v2.fluid.layers.scale
     :noindex:
 
+sigmoid_cross_entropy_with_logits
+---------------------------------
 
-array_length
--------------
-..  autofunction:: paddle.v2.fluid.layers.array_length
+..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
     :noindex:
 
+elementwise_add
+---------------
 
-conv2d_transpose
-----------------
-..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
     :noindex:
 
-
-sequence_expand
+elementwise_div
 ---------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
     :noindex:
 
+elementwise_sub
+---------------
 
-gru_unit
---------
-..  autofunction:: paddle.v2.fluid.layers.gru_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
     :noindex:
 
+elementwise_mul
+---------------
 
-lstm_unit
----------
-..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
     :noindex:
 
+elementwise_max
+---------------
 
-sequence_softmax
-----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+..  autofunction:: paddle.v2.fluid.layers.elementwise_max
     :noindex:
 
+elementwise_min
+---------------
 
-reduce_sum
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+..  autofunction:: paddle.v2.fluid.layers.elementwise_min
     :noindex:
 
+elementwise_pow
+---------------
 
-reduce_mean
------------
-..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+..  autofunction:: paddle.v2.fluid.layers.elementwise_pow
     :noindex:
 
+clip
+----
 
-reduce_max
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_max
+..  autofunction:: paddle.v2.fluid.layers.clip
     :noindex:
 
+clip_by_norm
+------------
 
-reduce_min
-----------
-..  autofunction:: paddle.v2.fluid.layers.reduce_min
+..  autofunction:: paddle.v2.fluid.layers.clip_by_norm
     :noindex:
 
+sequence_softmax
+----------------
 
-split
------
-..  autofunction:: paddle.v2.fluid.layers.split
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
     :noindex:
 
+sigmoid
+-------
 
-matmul
-------
-..  autofunction:: paddle.v2.fluid.layers.matmul
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
     :noindex:
 
 logsigmoid
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.logsigmoid
     :noindex:
 
 exp
 ---
+
 ..  autofunction:: paddle.v2.fluid.layers.exp
     :noindex:
 
 relu
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.relu
     :noindex:
 
 tanh
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.tanh
     :noindex:
 
 tanh_shrink
 -----------
+
 ..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
     :noindex:
 
 softshrink
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.softshrink
     :noindex:
 
 sqrt
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.sqrt
     :noindex:
 
 abs
-----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.abs
     :noindex:
 
 ceil
 ----
+
 ..  autofunction:: paddle.v2.fluid.layers.ceil
     :noindex:
 
 floor
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.floor
     :noindex:
 
 round
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.round
     :noindex:
 
 reciprocal
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.reciprocal
     :noindex:
 
 log
 ---
+
 ..  autofunction:: paddle.v2.fluid.layers.log
     :noindex:
 
 square
 ------
+
 ..  autofunction:: paddle.v2.fluid.layers.square
     :noindex:
 
 softplus
 --------
+
 ..  autofunction:: paddle.v2.fluid.layers.softplus
     :noindex:
 
 softsign
----------
+--------
+
 ..  autofunction:: paddle.v2.fluid.layers.softsign
     :noindex:
 
 brelu
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.brelu
     :noindex:
 
 leaky_relu
 ----------
+
 ..  autofunction:: paddle.v2.fluid.layers.leaky_relu
     :noindex:
 
 soft_relu
 ---------
+
 ..  autofunction:: paddle.v2.fluid.layers.soft_relu
     :noindex:
 
 elu
-----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.elu
     :noindex:
 
 relu6
 -----
+
 ..  autofunction:: paddle.v2.fluid.layers.relu6
     :noindex:
 
 pow
-----
+---
+
 ..  autofunction:: paddle.v2.fluid.layers.pow
     :noindex:
 
+stanh
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.stanh
+    :noindex:
+
 hard_shrink
 -----------
+
 ..  autofunction:: paddle.v2.fluid.layers.hard_shrink
     :noindex:
 
 thresholded_relu
 ----------------
+
 ..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
     :noindex:
 
 hard_sigmoid
--------------
+------------
+
 ..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
     :noindex:
 
 swish
-------
+-----
+
 ..  autofunction:: paddle.v2.fluid.layers.swish
     :noindex:
 
-im2sequence
+tensor
+======
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_tensor
+    :noindex:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_parameter
+    :noindex:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.v2.fluid.layers.create_global_var
+    :noindex:
+
+cast
+----
+
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+
+concat
 ------
-..  autofunction:: paddle.v2.fluid.layers.im2sequence
+
+..  autofunction:: paddle.v2.fluid.layers.concat
     :noindex:
 
-edit_distance
----------------
-..  autofunction:: paddle.v2.fluid.layers.edit_distance_error
+sums
+----
+
+..  autofunction:: paddle.v2.fluid.layers.sums
     :noindex:
 
-ctc_greedy_decoder
----------------
-..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+assign
+------
+
+..  autofunction:: paddle.v2.fluid.layers.assign
     :noindex:
 
-l2_normalize
-------------
-..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
     :noindex:
 
-sequence_reshape
-----------------
-..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+fill_constant
+-------------
+
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
     :noindex:
 
-row_conv
---------
-..  autofunction:: paddle.v2.fluid.layers.row_conv
+ones
+----
+
+..  autofunction:: paddle.v2.fluid.layers.ones
     :noindex:
 
-multiplex
----------
-..  autofunction:: paddle.v2.fluid.layers.multiplex
+zeros
+-----
+
+..  autofunction:: paddle.v2.fluid.layers.zeros
     :noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index 500019bc507f859c4c91de5d322a82eb1e78e2de..015581b7660848bdb0845fafe2d3fc05405e6ae6 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -1,33 +1,31 @@
-===========
-Nets
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+nets
+====
 
 simple_img_conv_pool
 --------------------
-..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
-    :noindex:
 
-
-img_conv_group
----------------
-..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
     :noindex:
 
-
 sequence_conv_pool
 ------------------
+
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
     :noindex:
 
-
 glu
 ---
+
 ..  autofunction:: paddle.v2.fluid.nets.glu
     :noindex:
 
-
 scaled_dot_product_attention
 ----------------------------
+
 ..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
     :noindex:
 
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
index 19b4940f08de3e2f7dc177f2961e538946d10a78..1691ebb9a7cb16da96e04147d0adea322374f529 100644
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -1,54 +1,49 @@
-===========
-Optimizer
-===========
-
-Optimizer
------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: Optimizer
-    :noindex:
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+=========
+optimizer
+=========
 
-SGDOptimizer
------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: SGDOptimizer
-    :noindex:
+SGD
+---
 
+..  autoclass:: paddle.v2.fluid.optimizer.SGD
+    :members:
+    :noindex:
 
+Momentum
+--------
 
-MomentumOptimizer
------------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: MomentumOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Momentum
+    :members:
     :noindex:
 
+Adagrad
+-------
 
-
-AdagradOptimizer
-----------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adagrad
+    :members:
     :noindex:
 
+Adam
+----
 
-AdamOptimizer
--------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adam
+    :members:
     :noindex:
 
+Adamax
+------
 
-AdamaxOptimizer
------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: AdamaxOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.Adamax
+    :members:
     :noindex:
 
+DecayedAdagrad
+--------------
 
-DecayedAdagradOptimizer
------------------------
-..  automodule:: paddle.v2.fluid.optimizer
-    :members: DecayedAdagradOptimizer
+..  autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
+    :members:
     :noindex:
 
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
index ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50..8083d0d858dafcd275eaddb9b475875ee42ef724 100644
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -1,11 +1,21 @@
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+param_attr
+==========
+
 ParamAttr
-===========
+---------
 
+..  autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:
 
+WeightNormParamAttr
+-------------------
 
-ParamAttr
------------
-..  automodule:: paddle.v2.fluid.param_attr
-    :members: ParamAttr
+..  autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
+    :members:
     :noindex:
+
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
index 7d4042d1f41c12c4a551ba6576559d612116872a..4a1ff7cb6976e0054f77428b699ea679aa91394f 100644
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
@@ -1,10 +1,25 @@
-===========
-Profiler
-===========
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
 
+========
+profiler
+========
 
+cuda_profiler
+-------------
 
-Profiler
------------
 ..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
     :noindex:
+
+reset_profiler
+--------------
+
+..  autofunction:: paddle.v2.fluid.profiler.reset_profiler
+    :noindex:
+
+profiler
+--------
+
+..  autofunction:: paddle.v2.fluid.profiler.profiler
+    :noindex:
+
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
index 868e225ed3d59e79aeb217fb88081ea25f80fa2c..2c17d15599baa1d02eb87c7b6c40034769ebb3a4 100644
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -1,25 +1,27 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
 ===========
-Regularizer
+regularizer
 ===========
 
-WeightDecayRegularizer
-----------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: WeightDecayRegularizer
-    :noindex:
-
+append_regularization_ops
+-------------------------
 
-L2DecayRegularizer
-------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L2DecayRegularizer
+..  autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
     :noindex:
 
+L1Decay
+-------
 
+..  autoclass:: paddle.v2.fluid.regularizer.L1Decay
+    :members:
+    :noindex:
 
-L1DecayRegularizer
--------------------
-..  automodule:: paddle.v2.fluid.regularizer
-    :members: L1DecayRegularizer
+L2Decay
+-------
 
+..  autoclass:: paddle.v2.fluid.regularizer.L2Decay
+    :members:
+    :noindex:
 
diff --git a/doc/design/speech/README.MD b/doc/design/speech/deep_speech_2.md
similarity index 85%
rename from doc/design/speech/README.MD
rename to doc/design/speech/deep_speech_2.md
index 7304650e628dba210488cd2dc4836318b5383b2a..cfdc4d6df04344c70d3334626bd38eca997c31ff 100644
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/deep_speech_2.md
@@ -140,7 +140,19 @@ TODO by Assignees
 
 ### Beam Search with CTC and LM
 
-TODO by Assignees
+<div align="center">
+<img src="image/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 
+   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
+- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
+- Such external scorer consists of language model, word count or any other custom scorers.
+- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
+ 
 
 ## Future Work
 
@@ -153,3 +165,4 @@ TODO by Assignees
 
 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/design/speech/image/beam_search.png b/doc/design/speech/image/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/design/speech/image/beam_search.png differ
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 71904dc41ed0d946867d890cc585e1b88450ca8c..ff904b1022a41612c9680dce92d3fc2c69ad7e93 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -115,7 +115,7 @@ PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种B
     "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
     "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
     "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
-    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_TESTING", "是否开启单元测试", "OFF"
     "WITH_DOC", "是否编译中英文文档", "OFF"
     "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练", "Auto"
     "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 27f73b2e2c029b41d514e1612912ed1c335605b6..718fb869c23a1f7be82c87c726282bded9dad516 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
     "WITH_AVX", "Build with AVX support", "ON"
     "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
     "WITH_STYLE_CHECK", "Check code style when building", "ON"
-    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_TESTING", "Build unit tests", "OFF"
     "WITH_DOC", "Build documentations", "OFF"
     "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
     "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 98fada7bdb46f4dd2927d6f93bcbcebbe7d18604..79d214635a069a739060e0b79424729f6ff90387 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
      docker run -p 8888:8888 paddlepaddle/book
 
+国内用户可以使用下面的镜像源来加速访问：
+
+  .. code-block: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
 然后在浏览器中输入以下网址：
 
   .. code-block:: text
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index b1d0890b4cdddb77114a80276130afd07c22d270..e0e0559fb858a093db96a9b4ec1c5a45d6c71a38 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
 
      docker run -p 8888:8888 paddlepaddle/book
 
+For users in China, we provide a faster mirror:
+
+  .. code-block: bash
+
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
 Then, you would back and paste the address into the local browser:
 
   .. code-block:: text
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 318661af8bd04880577222fdc82cc1b6e79a457f..8b71f73c36c33d882b34c833031c50cd14817e76 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -22,11 +22,11 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 
 cc_test(variable_test SRCS variable_test.cc)
 
-cc_library(threadpool SRCS threadpool.cc)
+cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
 cc_library(scope SRCS scope.cc DEPS glog threadpool)
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
index 70ecccc1a1078374f3190b3956103ed8000c4fc5..0570980c5a4d7fa45e672ae5baac65d2c65ddad9 100644
--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@@ -26,9 +26,7 @@ class Channel {
   virtual void Send(T*) = 0;
   virtual void Receive(T*) = 0;
   virtual size_t Cap() = 0;
-
-  // Don't delete channels; instead, call Channel::Close.
- protected:
+  virtual void Close() = 0;
   virtual ~Channel() {}
 };
 
@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) {
 
 template <typename T>
 void CloseChannel(Channel<T>* ch) {
-  if (ch->Cap() > 0) {
-    delete dynamic_cast<details::Buffered<T>*>(ch);
-  } else {
-    delete dynamic_cast<details::UnBuffered<T>*>(ch);
-  }
+  ch->Close();
 }
 
 }  // namespace framework
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 9efc0172658c800d14102531332dbef68fa392f4..1510fb8abf54f05804bd404d9bd00ecc42fbef63 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -14,13 +14,67 @@ limitations under the License. */
 
 #include "paddle/framework/channel.h"
 
+#include <chrono>
+#include <thread>
+
 #include "gtest/gtest.h"
 
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
+
 TEST(Channel, MakeAndClose) {
-  using paddle::framework::Channel;
-  using paddle::framework::MakeChannel;
-  using paddle::framework::CloseChannel;
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+  {
+    // MakeChannel should return a buffered channel is buffer_size > 0.
+    auto ch = MakeChannel<int>(10);
+    EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+  {
+    // MakeChannel should return an un-buffered channel is buffer_size = 0.
+    auto ch = MakeChannel<int>(0);
+    EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+}
+
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  for (size_t i = 0; i < buffer_size; ++i) {
+    ch->Send(&i);  // should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    ch->Receive(&out);  // should not block
+    EXPECT_EQ(out, i);
+  }
+  CloseChannel(ch);
+  delete ch;
+}
+
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t sum = 0;
+  std::thread t([&]() {
+    // Try to write more than buffer size.
+    for (size_t i = 0; i < 2 * buffer_size; ++i) {
+      ch->Send(&i);  // should not block
+      sum += i;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  EXPECT_EQ(sum, 45U);
 
-  Channel<int>* ch = MakeChannel<int>(10);
   CloseChannel(ch);
+  t.join();
+  delete ch;
 }
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index 572e29d44a3baec84a029d87f9b0874784aa761b..b093e1589293b030ef2bedb82504a8e86b3dc857 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <mutex>
 
 #include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
@@ -32,6 +33,8 @@ class Buffered : public paddle::framework::Channel<T> {
   virtual void Send(T*);
   virtual void Receive(T*);
   virtual size_t Cap() { return cap_; }
+  virtual void Close();
+  virtual ~Buffered();
 
  private:
   size_t cap_;
@@ -39,9 +42,11 @@ class Buffered : public paddle::framework::Channel<T> {
   std::condition_variable empty_cond_var_;
   std::condition_variable full_cond_var_;
   std::deque<T> channel_;
+  bool closed_;
 
-  Buffered(size_t cap) : cap_(cap) {}
-  virtual ~Buffered();
+  Buffered(size_t cap) : cap_(cap), closed_(false) {
+    PADDLE_ENFORCE_GT(cap, 0);
+  }
 
   void NotifyAllSenders(std::unique_lock<std::mutex>*);
 };
@@ -49,24 +54,39 @@ class Buffered : public paddle::framework::Channel<T> {
 template <typename T>
 void Buffered<T>::Send(T* item) {
   std::unique_lock<std::mutex> lock(mu_);
-  full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; });
-  channel_.push_back(std::move(*item));
-  lock.unlock();
-  empty_cond_var_.notify_one();
+  full_cond_var_.wait(lock,
+                      [this]() { return channel_.size() < cap_ || closed_; });
+  if (!closed_) {
+    channel_.push_back(std::move(*item));
+    lock.unlock();
+    empty_cond_var_.notify_one();
+  }
 }
 
 template <typename T>
 void Buffered<T>::Receive(T* item) {
   std::unique_lock<std::mutex> lock(mu_);
-  empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
-  *item = std::move(channel_.front());
-  channel_.pop_front();
+  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+  if (!closed_) {
+    *item = std::move(channel_.front());
+    channel_.pop_front();
+    NotifyAllSenders(&lock);
+  } else {
+    item = nullptr;
+  }
+}
+
+template <typename T>
+void Buffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
   NotifyAllSenders(&lock);
 }
 
 template <typename T>
 Buffered<T>::~Buffered() {
   std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
   channel_.clear();
   NotifyAllSenders(&lock);
 }
@@ -74,7 +94,7 @@ Buffered<T>::~Buffered() {
 template <typename T>
 void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
   lock->unlock();
-  full_cond_var_.notify_one();
+  full_cond_var_.notify_all();
 }
 
 }  // namespace details
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index 7ecced1fba88fea781fc342091bc71e5aa496d3a..cc2d2e587eca981307d4e522bd569fbffa450207 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -32,10 +32,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
   virtual void Send(T*);
   virtual void Receive(T*);
   virtual size_t Cap() { return 0; }
+  virtual void Close();
+  virtual ~UnBuffered();
 
  private:
   UnBuffered() {}
-  virtual ~UnBuffered();
 };
 
 template <typename T>
@@ -44,6 +45,9 @@ void UnBuffered<T>::Send(T* channel_element) {}
 template <typename T>
 void UnBuffered<T>::Receive(T*) {}
 
+template <typename T>
+void UnBuffered<T>::Close() {}
+
 template <typename T>
 UnBuffered<T>::~UnBuffered() {}
 
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index cbf3ec75265fa74aaacffee684b7b7d5f73b7c02..9a232b08434d299d10bb2acdb6e96295de875d56 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/platform/place.h"
 #include "paddle/platform/profiler.h"
 
-DECLARE_bool(do_memory_benchmark);
+DECLARE_bool(benchmark);
 DEFINE_bool(check_nan_inf, false,
             "Checking whether operator produce NAN/INF or not. It will be "
             "extremely slow so please use this flag wisely.");
@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
 namespace paddle {
 namespace framework {
 
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
-
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 
     op->Run(*local_scope, place_);
     VLOG(3) << op->DebugStringEx(local_scope);
-    if (FLAGS_do_memory_benchmark) {
+    if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
     }
@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
   if (create_vars && create_local_scope) {
     scope->DeleteScope(local_scope);
   }
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
     VLOG(2) << "-------------------------------------------------------";
     VLOG(2) << "Memory used after deleting local scope: "
             << memory::memory_usage(place_);
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
index 9bc4a90c44828ecb7458d524f59609f01848cc5c..168f456675af508df86dd0520cdeb5d16d94ad31 100644
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/framework/lod_tensor.h"
 
@@ -20,5 +21,8 @@ namespace paddle {
 namespace framework {
 using FeedFetchType = LoDTensor;
 using FeedFetchList = std::vector<FeedFetchType>;
+
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index 4ef82a541efaa35bcf831d5122570154f2fa2423..3f6ea121b3994979d89a7d5a8c20c59240a0c111 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <string.h>  // for strdup
 #include <algorithm>
+#include <stdexcept>
 #include <string>
 
 #include "paddle/framework/init.h"
@@ -46,17 +47,23 @@ void InitDevices() {
 
   std::vector<platform::Place> places;
   places.emplace_back(platform::CPUPlace());
+  int count = 0;
 
 #ifdef PADDLE_WITH_CUDA
-  int count = platform::GetCUDADeviceCount();
-  for (int i = 0; i < count; ++i) {
-    places.emplace_back(platform::CUDAPlace(i));
+  try {
+    count = platform::GetCUDADeviceCount();
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
   }
 #else
   LOG(WARNING)
-      << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+      << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
 #endif
 
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(platform::CUDAPlace(i));
+  }
+
   platform::DeviceContextPool::Init(places);
 }
 
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
index f837a965d3be7d40c20803ae4462b3bfd91bffd0..01e076dd8ea24831e3ed7c8a7f8fae6818a89335 100644
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
   using paddle::framework::InitDevices;
   using paddle::platform::DeviceContextPool;
 
+#ifndef PADDLE_WITH_CUDA
   InitDevices();
   DeviceContextPool& pool = DeviceContextPool::Instance();
-  ASSERT_GE(pool.size(), 1U);
+  ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+
+TEST(InitDevices, CUDA) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_CUDA
+  int count = paddle::platform::GetCUDADeviceCount();
+  InitDevices();
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
 }
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 53b0d0fe083579da4f0bb600f292765aa2aa0d8a..cb27de6991674247e6215ce64a2da5000fa78ed4 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
 #include <algorithm>
 #include <iterator>
 
-#include <glog/logging.h>
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 9d1294fdeb9bd76bf944f7ec3687e3c5bb333241..d0ab640485baf6d76ee629ea420b603f42b031b4 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
 #endif
 
 #include <glog/logging.h>
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/mixed_vector.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/framework/tensor_util.h"
 #include "paddle/platform/enforce.h"
@@ -31,15 +31,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-#ifndef PADDLE_WITH_CUDA
-template <typename T>
-using Vector = std::vector<T>;
-#else
-template <typename T>
-using Vector = thrust::host_vector<
-    T, thrust::system::cuda::experimental::pinned_allocator<T>>;
-#endif
-
 /*
  * LoD is short for Level of Details.
  *
@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
  *    0 2 4 7
  *    0 2 5 7 10 12 15 20
  */
-using LoD = std::vector<Vector<size_t>>;
+struct LoD : public std::vector<Vector<size_t>> {
+  using std::vector<Vector<size_t>>::vector;
+
+  void CopyFromCUDA() {
+    for (auto it = this->begin(); it != this->end(); ++it) {
+      it->CopyFromCUDA();
+    }
+  }
+};
 
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
  */
 class LoDTensor : public Tensor {
  public:
-  LoDTensor() {}
+  LoDTensor() : Tensor() {}
+
+  /* Constructor with place should only be used in pybind */
+  explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
 
   explicit LoDTensor(const LoD& lod) : lod_(lod) {}
 
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 4d172c43c7cceacb7d0dfaf1c4d3028717350268..3b63020e685436396071fa05cd7697630ae56c95 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -23,6 +23,17 @@
 namespace paddle {
 namespace framework {
 
+TEST(LoD, data) {
+  LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+
+  auto& v = lod[0];
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], i);
+  }
+}
+
 TEST(LodExpand, test) {
   LoD lod{{0, 2}};
   LoDTensor tensor;
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 1e253a2f6f35e827fb2e5db6270da03705b39514..d4c9f00bd9c00f3cae68858ca46c5320fc117405 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -14,6 +14,8 @@
 
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <stdio.h>
+#include "paddle/framework/init.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/platform/assert.h"
 
@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
   }
 }
 
+TEST(Vector, Normal) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+
+  paddle::framework::InitDevices();
+
+  paddle::framework::Vector<size_t> vec({1, 2, 3});
+  size_t* ptr = vec.data();
+  for (size_t i = 0; i < vec.size(); ++i) {
+    EXPECT_EQ(vec[i], *(ptr + i));
+  }
+
+  vec.clear();
+  vec.CopyFromCUDA();
+
+  std::vector<size_t> v = {1, 2, 3};
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], vec[i]);
+  }
+}
+
+TEST(LoD, data) {
+  paddle::framework::InitDevices();
+
+  paddle::framework::LoD lod{{0, 1, 2}};
+  lod.push_back({0, 2, 4, 5});
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+
+  auto& v = lod[0];
+  test<<<1, 1>>>(v.cuda_data(), v.size());
+  cudaDeviceSynchronize();
+
+  v.CopyFromCUDA();
+  for (size_t i = 0; i < v.size(); ++i) {
+    EXPECT_EQ(v[i], i * 2);
+  }
+}
+
 TEST(LoDTensor, LoDInGPU) {
+  paddle::framework::InitDevices();
+
   paddle::framework::LoDTensor lod_tensor;
   paddle::platform::CUDAPlace place(0);
 
@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
 
   auto lod = lod_tensor.lod();
 
-  test<<<1, 8>>>(lod[0].data(), lod[0].size());
+  test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
   cudaDeviceSynchronize();
+  lod.CopyFromCUDA();
 
   for (size_t i = 0; i < src_lod[0].size(); ++i) {
     EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e0e23958602343f8e0106e3a88eaac9c6d71066
--- /dev/null
+++ b/paddle/framework/mixed_vector.h
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <initializer_list>
+#include <vector>
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief Vector support both cpu and gpu.
+ * host vector lifetime is same with Vector
+ * device vector is lazily malloc and modified.
+ */
+
+template <typename T>
+class Vector : public std::vector<T> {
+ public:
+  /* NOTE(dzhwinter):
+   * Data always store and modified on Host.
+   * If the data is modified when use cuda_data interface,
+   * You need to call the CopyFromCUDA explicitly to synchronize data.
+   *
+   */
+  enum class kDataPosition {
+    kDataOnHost = 0,
+    kDataOnDevice = 1,
+  };
+
+ public:
+  using std::vector<T>::vector;
+
+  Vector() {}
+  Vector(const std::vector<T> &v) : std::vector<T>(v) {}  // NOLINT
+
+  virtual ~Vector() {
+#ifdef PADDLE_WITH_CUDA
+    if (cuda_ptr_ != nullptr) {
+      memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
+    }
+#endif
+  }
+
+  T *cuda_data() {
+    CopyToCUDA();
+    PADDLE_ENFORCE_NOT_NULL(
+        cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
+    return static_cast<T *>(cuda_ptr_);
+  }
+
+  T *data() { return std::vector<T>::data(); }
+
+  const T *data() const { return std::vector<T>::data(); }
+
+  void CopyToCUDA();
+
+  void CopyFromCUDA();
+
+  void CopyToPeer(platform::Place);
+
+ private:
+  void *cuda_ptr_ = nullptr;
+  size_t cuda_size_ = 0;
+  /*The DataPosition is unused now,
+    if we want support random access from cpu and cuda,
+    we need to overload all the vector method */
+
+  kDataPosition position_ = kDataPosition::kDataOnHost;
+  platform::CUDAPlace place_;
+};
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  if (cuda_ptr_ == nullptr) {
+    cuda_ptr_ =
+        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
+  }
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *cuda_ctx = pool.GetByPlace(place_);
+
+  memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
+               static_cast<const void *>(this->data()),
+               this->size() * sizeof(T), cuda_ctx->stream());
+  cuda_ctx->Wait();
+
+  cuda_size_ = this->size();
+#endif
+}
+
+template <typename T>
+void Vector<T>::CopyFromCUDA() {
+#ifdef PADDLE_WITH_CUDA
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *cuda_ctx = pool.GetByPlace(place_);
+  if (cuda_ptr_ == nullptr) {
+    LOG(WARNING) << "No uncommited cuda data.";
+    return;
+  }
+  this->resize(cuda_size_);
+  memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
+               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
+               cuda_ctx->stream());
+  cuda_ctx->Wait();
+
+#endif
+}
+
+template <typename T>
+void Vector<T>::CopyToPeer(platform::Place peer_place) {
+  if (platform::is_cpu_place(peer_place)) {
+    return;
+  }
+#ifdef PADDLE_WITH_CUDA
+  auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  void *peer_cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(
+      boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
+  memory::Copy(boost::get<platform::CUDAPlace>(peer_place),
+               static_cast<void *>(peer_cuda_ptr_), place_,
+               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
+               cuda_ctx->stream());
+  cuda_ctx->Wait();
+  memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
+  place_ = boost::get<platform::CUDAPlace>(peer_place);
+  cuda_ptr_ = peer_cuda_ptr_;
+#endif
+}
+
+template class Vector<int>;
+template class Vector<unsigned>;
+template class Vector<size_t>;
+template class Vector<int64_t>;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 831b1e2a1e10777d9e89364adcd4b1f367e86080..4e854f54dd43d760bab44fb5f7cafeb13314b27c 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,9 +22,7 @@ limitations under the License. */
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
 
-DEFINE_bool(op_sync, false,
-            "Default cuda is asynchronous device, set to True will"
-            "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
 
 namespace paddle {
 namespace framework {
@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope,
       ExecutionContext(*this, new_scope, *new_dev_ctx));
 
   /*For profiling/benchmark only*/
-  if (FLAGS_op_sync) {
+  if (FLAGS_benchmark) {
     new_dev_ctx->Wait();
   }
 }
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index b5d9e5e385c1ba57169ef885824fc23b0f130692..15ea4035c6e6193105b621210a900e74d1466941 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace framework {
@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
   }
 }
 
+const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
+  BlockDesc *global_block = blocks_[0].get();
+  std::vector<std::string> feed_target_names;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
+    }
+  }
+  return feed_target_names;
+}
+
+const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
+  BlockDesc *global_block = blocks_[0].get();
+  std::vector<std::string> fetch_target_names;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      fetch_target_names.push_back(op->Input("X")[0]);
+    }
+  }
+  return fetch_target_names;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 15a962bb696d6172acd1a83cf9bb1ffd0846d449..8e958eab6ee08436ca73b13bac010e66c7df2b8b 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <vector>
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/proto_desc.h"
 #include "paddle/platform/macros.h"
@@ -45,6 +46,9 @@ class ProgramDesc {
 
   proto::ProgramDesc *Proto();
 
+  const std::vector<std::string> GetFeedTargetNames();
+  const std::vector<std::string> GetFetchTargetNames();
+
  private:
   proto::ProgramDesc desc_;
 
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
index 25eb813ffb96e9b1e13299421ead9f85c02da59f..bff8e0bceaca9749101b2c45edddba526d565624 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include <glog/logging.h>
@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
       *op_field->Add() = input.blocks(block_id).ops(i);
     }
   }
+
+  // remove the VarDescs in BlockDesc that are not referenced in
+  // the pruned OpDescs
+  std::unordered_map<std::string, proto::VarDesc> var_map;
+  auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
+  for (const auto& var : *var_field) {
+    var_map[var.name()] = var;
+  }
+
+  var_field->Clear();
+  for (const auto& op : *op_field) {
+    // add VarDescs of all input arguments for each OpDesc
+    auto& input_field = op.inputs();
+    for (auto& input_var : input_field) {
+      for (auto& arg : input_var.arguments()) {
+        *var_field->Add() = var_map[arg];
+      }
+    }
+    // add VarDescs of all output arguments for each OpDesc
+    auto& output_field = op.outputs();
+    for (auto& output_var : output_field) {
+      for (auto& arg : output_var.arguments()) {
+        *var_field->Add() = var_map[arg];
+      }
+    }
+  }
 }
 
 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index a67ff910093d93060d07d849f6e968e5f4ce21cd..af08b2ab816f63c05d4c65df9601c787e57994f5 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,9 +20,11 @@ limitations under the License. */
 #include "paddle/framework/threadpool.h"
 #include "paddle/string/printf.h"
 
-DEFINE_bool(do_memory_benchmark, false,
+DEFINE_bool(benchmark, false,
             "Doing memory benchmark. It will make deleting scope synchronized, "
-            "and add some memory usage logs");
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
 
 namespace paddle {
 namespace framework {
@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_do_memory_benchmark) {
+  if (FLAGS_benchmark) {
     delete scope;
   } else {
     Async([scope] { delete scope; });
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4aaa29d794c95592832a1fe990e2dce274eba9d5..f0ea709a5c37e769e3ffa1b2e9d1e39721979251 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -47,6 +47,11 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}
 
+  /*! Constructor with place should only be used in pybind. */
+  explicit Tensor(const platform::Place& place) : offset_(0) {
+    holder_->set_place(place);
+  }
+
   /*! Return a pointer to mutable memory block. */
   template <typename T>
   inline T* data();
@@ -137,6 +142,7 @@ class Tensor {
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
     virtual void set_type(std::type_index type) = 0;
+    virtual void set_place(platform::Place place) = 0;
   };
 
   template <typename Place>
@@ -156,6 +162,7 @@ class Tensor {
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual std::type_index type() const { return type_; }
     virtual void set_type(std::type_index type) { type_ = type; }
+    virtual void set_place(platform::Place place) { place_ = place; }
 
     /*! the pointer of memory block. */
     std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
index b2f5ae4a96593fde1623dd10d3b63c984ae228db..b7d7c00bcf9d9770f58284023ca2defcda299d64 100644
--- a/paddle/framework/threadpool.cc
+++ b/paddle/framework/threadpool.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/framework/threadpool.h"
 
+#include "paddle/platform/enforce.h"
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 8912b1a43a26f9df662d3b5ddf68bfb2b87f4a20..4e9b58679d9e7c84adf76b6245b397c7a8872483 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #include <thread>
 #include <vector>
 
-#include "paddle/platform/enforce.h"
+#include "paddle/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index ae4d3fd2f58daf87a650428e04722581610ed780..2289ddc139cbddfbaa5238e683b2f8e784a7291e 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -1,14 +1,14 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
 
 cc_library(paddle_fluid_api
-    SRCS inference.cc
+    SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 # Merge all modules into a single static library
 cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 # Create shared library
-add_library(paddle_fluid_shared SHARED inference.cc)
+add_library(paddle_fluid_shared SHARED io.cc)
 
 target_circle_link_libraries(paddle_fluid_shared
   ARCHIVE_START
@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 
 # install library & headers
 if(NOT WITH_C_API AND WITH_FLUID)
-  install(FILES inference.h DESTINATION include/paddle/inference)
+  install(FILES io.h DESTINATION include/paddle/inference)
   install(TARGETS paddle_fluid_shared DESTINATION lib)
 endif()
 
-add_executable(example example.cc)
-if(APPLE)
-  set(OPTIONAL_LINK_FLAGS)
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-    set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
-  endif()
-  target_link_libraries(example
-      -Wl,-force_load paddle_fluid
-      ${OPTIONAL_LINK_FLAGS}
-      ${PTOOLS_LIB})
-else()
-  target_link_libraries(example
-      -Wl,--start-group -Wl,--whole-archive paddle_fluid
-      -Wl,--no-whole-archive -Wl,--end-group
-      ${PTOOLS_LIB})
+if(WITH_TESTING)
+  add_subdirectory(tests/book)
 endif()
diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc
deleted file mode 100644
index 0c18b45624dedcb5839d4b771e044b4a7b32af52..0000000000000000000000000000000000000000
--- a/paddle/inference/example.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <time.h>
-#include <iostream>
-#include "gflags/gflags.h"
-#include "paddle/inference/inference.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_dirname.empty()) {
-    // Example:
-    //   ./example --dirname=recognize_digits_mlp.inference.model
-    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
-    exit(1);
-  }
-
-  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
-  std::string dirname = FLAGS_dirname;
-
-  paddle::InferenceEngine* engine = new paddle::InferenceEngine();
-  engine->LoadInferenceModel(dirname);
-
-  paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
-
-  std::vector<paddle::framework::LoDTensor> feeds;
-  feeds.push_back(input);
-  std::vector<paddle::framework::LoDTensor> fetchs;
-  engine->Execute(feeds, fetchs);
-
-  for (size_t i = 0; i < fetchs.size(); ++i) {
-    auto dims_i = fetchs[i].dims();
-    std::cout << "dims_i:";
-    for (int j = 0; j < dims_i.size(); ++j) {
-      std::cout << " " << dims_i[j];
-    }
-    std::cout << std::endl;
-    std::cout << "result:";
-    float* output_ptr = fetchs[i].data<float>();
-    for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
-      std::cout << " " << output_ptr[j];
-    }
-    std::cout << std::endl;
-  }
-
-  delete engine;
-  return 0;
-}
diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc
deleted file mode 100644
index b43c359ed1787143403336e8c1cb4c7f85b1d7a2..0000000000000000000000000000000000000000
--- a/paddle/inference/inference.cc
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "inference.h"
-#include <fstream>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/init.h"
-#include "paddle/framework/scope.h"
-
-namespace paddle {
-
-void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
-  std::string model_filename = dirname + "/__model__";
-  LOG(INFO) << "loading model from " << model_filename;
-  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
-  std::string program_desc_str;
-  inputfs.seekg(0, std::ios::end);
-  program_desc_str.resize(inputfs.tellg());
-  inputfs.seekg(0, std::ios::beg);
-  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
-  inputfs.read(&program_desc_str[0], program_desc_str.size());
-  inputfs.close();
-
-  program_ = new framework::ProgramDesc(program_desc_str);
-  GenerateLoadProgram(dirname);
-
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-  feed_var_names_.clear();
-  fetch_var_names_.clear();
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == "feed") {
-      feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
-    } else if (op->Type() == "fetch") {
-      fetch_var_names_.push_back(op->Input("X")[0]);
-    }
-  }
-}
-
-bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
-  if (var->Persistable()) {
-    // There are many unreachable variables in the program
-    for (size_t i = 0; i < program_->Size(); ++i) {
-      const framework::BlockDesc& block = program_->Block(i);
-      for (auto* op : block.AllOps()) {
-        if (op->Type() == "feed") {
-          continue;
-        }
-        for (auto input_argument_name : op->InputArgumentNames()) {
-          if (input_argument_name == var->Name()) {
-            return true;
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
-void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-
-  load_program_ = new framework::ProgramDesc();
-  framework::BlockDesc* load_block = load_program_->MutableBlock(0);
-  for (auto* var : global_block->AllVars()) {
-    if (IsParameter(var)) {
-      LOG(INFO) << "parameter's name: " << var->Name();
-
-      framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->Shape());
-      new_var->SetDataType(var->GetDataType());
-      new_var->SetType(var->GetType());
-      new_var->SetLoDLevel(var->GetLoDLevel());
-      new_var->SetPersistable(true);
-
-      // append_op
-      framework::OpDesc* op = load_block->AppendOp();
-      op->SetType("load");
-      op->SetOutput("Out", {new_var->Name()});
-      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
-      op->CheckAttrs();
-    }
-  }
-}
-
-void InferenceEngine::PrependFeedOp() {
-  if (!program_) {
-    LOG(FATAL) << "Please initialize the program_ first.";
-  }
-
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-
-  // create_var
-  framework::VarDesc* feed_var = global_block->Var("feed");
-  feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
-  feed_var->SetPersistable(true);
-
-  // prepend feed_op
-  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
-    std::string var_name = feed_var_names_[i];
-    LOG(INFO) << "feed var's name: " << var_name;
-
-    // prepend_op
-    framework::OpDesc* op = global_block->PrependOp();
-    op->SetType("feed");
-    op->SetInput("X", {"feed"});
-    op->SetOutput("Out", {var_name});
-    op->SetAttr("col", {static_cast<int>(i)});
-    op->CheckAttrs();
-  }
-}
-
-void InferenceEngine::AppendFetchOp() {
-  if (!program_) {
-    LOG(FATAL) << "Please initialize the program_ first.";
-  }
-
-  framework::BlockDesc* global_block = program_->MutableBlock(0);
-
-  // create_var
-  framework::VarDesc* fetch_var = global_block->Var("fetch");
-  fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
-  fetch_var->SetPersistable(true);
-
-  // append fetch_op
-  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
-    std::string var_name = fetch_var_names_[i];
-    LOG(INFO) << "fetch var's name: " << var_name;
-
-    // append_op
-    framework::OpDesc* op = global_block->AppendOp();
-    op->SetType("fetch");
-    op->SetInput("X", {var_name});
-    op->SetOutput("Out", {"fetch"});
-    op->SetAttr("col", {static_cast<int>(i)});
-    op->CheckAttrs();
-  }
-}
-
-void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
-                              std::vector<framework::LoDTensor>& fetchs) {
-  if (!program_ || !load_program_) {
-    LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
-  }
-
-  if (feeds.size() != feed_var_names_.size()) {
-    LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
-  }
-
-  auto* place = new platform::CPUPlace();
-  framework::InitDevices();
-  framework::Executor* executor = new framework::Executor(*place);
-  framework::Scope* scope = new framework::Scope();
-
-  executor->Run(*load_program_, scope, 0, true, true);
-
-  std::map<std::string, const framework::LoDTensor*> feed_targets;
-  std::map<std::string, framework::LoDTensor*> fetch_targets;
-
-  // set_feed_variable
-  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
-    feed_targets[feed_var_names_[i]] = &feeds[i];
-  }
-
-  // get_fetch_variable
-  fetchs.resize(fetch_var_names_.size());
-  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
-    fetch_targets[fetch_var_names_[i]] = &fetchs[i];
-  }
-
-  executor->Run(*program_, scope, feed_targets, fetch_targets);
-
-  delete place;
-  delete scope;
-  delete executor;
-}
-}  // namespace paddle
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
deleted file mode 100644
index 26f259824b945e260b370ced9d065842264075d5..0000000000000000000000000000000000000000
--- a/paddle/inference/inference.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/program_desc.h"
-
-namespace paddle {
-
-class InferenceEngine {
-public:
-  InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
-  ~InferenceEngine() {
-    delete program_;
-    delete load_program_;
-  }
-
-  void LoadInferenceModel(const std::string& dirname);
-  void Execute(const std::vector<framework::LoDTensor>& feeds,
-               std::vector<framework::LoDTensor>& fetchs);
-
-private:
-  bool IsParameter(const framework::VarDesc* var);
-  void GenerateLoadProgram(const std::string& dirname);
-  void PrependFeedOp();
-  void AppendFetchOp();
-
-private:
-  framework::ProgramDesc* program_;
-  framework::ProgramDesc* load_program_;
-  std::vector<std::string> feed_var_names_;
-  std::vector<std::string> fetch_var_names_;
-};
-
-}  // namespace paddle
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60ad7af1c0a469beb6a07bf057a8647fcb98cca8
--- /dev/null
+++ b/paddle/inference/io.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/inference/io.h"
+
+#include <fstream>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace inference {
+
+bool IsParameter(const framework::VarDesc* var,
+                 const framework::ProgramDesc& main_program) {
+  if (var->Persistable()) {
+    // There are many unreachable variables in the program
+    for (size_t i = 0; i < main_program.Size(); ++i) {
+      const framework::BlockDesc& block = main_program.Block(i);
+      for (auto* op : block.AllOps()) {
+        if (op->Type() == framework::kFeedOpType) {
+          continue;
+        }
+        for (auto input_argument_name : op->InputArgumentNames()) {
+          if (input_argument_name == var->Name()) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+void LoadPersistables(framework::Executor& executor,
+                      framework::Scope& scope,
+                      const std::string& dirname,
+                      const framework::ProgramDesc& main_program) {
+  const framework::BlockDesc& global_block = main_program.Block(0);
+
+  framework::ProgramDesc* load_program = new framework::ProgramDesc();
+  framework::BlockDesc* load_block = load_program->MutableBlock(0);
+  for (auto* var : global_block.AllVars()) {
+    if (IsParameter(var, main_program)) {
+      VLOG(3) << "parameter's name: " << var->Name();
+
+      framework::VarDesc* new_var = load_block->Var(var->Name());
+      new_var->SetShape(var->Shape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      // append_op
+      framework::OpDesc* op = load_block->AppendOp();
+      op->SetType("load");
+      op->SetOutput("Out", {new_var->Name()});
+      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+      op->CheckAttrs();
+    }
+  }
+  executor.Run(*load_program, &scope, 0, true, true);
+  delete load_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& dirname) {
+  std::string model_filename = dirname + "/__model__";
+  LOG(INFO) << "loading model from " << model_filename;
+  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+  std::string program_desc_str;
+  inputfs.seekg(0, std::ios::end);
+  program_desc_str.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+  inputfs.read(&program_desc_str[0], program_desc_str.size());
+  inputfs.close();
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(program_desc_str));
+
+  LoadPersistables(executor, scope, dirname, *main_program);
+  return main_program;
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/inference/io.h b/paddle/inference/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..962b6c4e20d30de3cc28eae1c8c5c33b3ab5f6ac
--- /dev/null
+++ b/paddle/inference/io.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace inference {
+
+void LoadPersistables(framework::Executor& executor,
+                      framework::Scope& scope,
+                      const std::string& dirname,
+                      const framework::ProgramDesc& main_program);
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& dirname);
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3798fb8fd8769aef5940d4ce724cb0cc8686422
--- /dev/null
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+cc_test(test_inference_recognize_digits_mlp
+    SRCS test_inference_recognize_digits.cc
+    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+set_tests_properties(test_inference_recognize_digits_mlp
+    PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26dc2aee04261d9a1fd29b4d75bfacc7870c09d8
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <time.h>
+#include <sstream>
+#include "gflags/gflags.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+template <typename Place, typename T>
+void TestInference(const std::string& dirname,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 1. Define place, executor and scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load all parameters from file
+  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+  delete scope;
+}
+
+TEST(inference, recognize_digits) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
+  paddle::framework::LoDTensor input;
+  srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
+  for (int i = 0; i < 784; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+  }
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
+  float err = 1E-3;
+  int count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
+}
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 50b441e75d05d985d2ab2e0c46218eacab45e2ea..e903f43ba69ee9e28b3a03e8921a41ffa81a2542 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -175,6 +175,8 @@ endif()
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
@@ -194,3 +196,4 @@ if(WITH_GPU)
     cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 4e579387924a5b0499f29609bc6b1322030a3c0d..00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
     auto grad_merge = merge_func(context, grad);
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
-    auto& merge_rows = grad_merge.rows();
+    framework::Vector<int64_t> merge_rows(grad_merge.rows());
     // 2. m += g_m * g_m
     math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
     auto grad_square = sqare_func(context, grad_merge, grad_merge);
@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     SparseAdagradFunctorKernel<
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
-                                   lr, param_data, moment_data, grad_width,
+                      .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
+                                   param_data, moment_data, grad_width,
                                    epsilon);
   }
 };
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 9cc34bdded780e61e8700eb4fa4a295c84fb48bc..bf536687d398b8342e6ae76a07c11e5fe47483e0 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
           merge_func(ctx.template device_context<DeviceContext>(), grad);
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
-      auto* rows = grad_merge.rows().data();
+      int64_t* rows = nullptr;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        rows = grad_merge.mutable_rows()->cuda_data();
+      } else {
+        rows = grad_merge.mutable_rows()->data();
+      }
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       SparseAdamFunctor<T> functor(
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu
index 45635f16745346b08f7e31db2f25905bdbc3aeeb..2a970cd9fa965b4126356eaa1519068f9c7a7f34 100644
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
 
     auto stream = ctx.cuda_device_context().stream();
     MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-        num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+        num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
         merge_repeated, dev_out_lod0_ptr, output_data);
 
     // set output lod
-    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
-                                              dev_out_lod0.end());
+    std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
     framework::LoD out_lod;
     out_lod.push_back(host_out_lod0);
     output->set_lod(out_lod);
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index 35cb18797ff66cb87a6658e73ce02b0bfae29baa..5274aa204e6629c9c5ea850c433e0948c89015bd 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
                          "'dropout_prob' must be between 0.0 and 1.0.");
         });
     AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest or for debug that always the same output units "
+                  "will be dropped.")
+        .SetDefault(false);
     AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
 
     AddComment(R"DOC(
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index c56930336e865079f1b96df0f35b0a051fe63a27..84d78445a4fa340ba3c066bb48b96b2a890db652 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
       int size = framework::product(mask->dims());
-      int seed = context.Attr<int>("seed");
+
+      std::random_device rnd;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+
       thrust::counting_iterator<unsigned int> index_sequence_begin(0);
       thrust::transform(index_sequence_begin, index_sequence_begin + size,
                         thrust::device_ptr<T>(mask_data),
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index c90b8d277eb78048c001d36a367287146b51c636..46e5dbc64ff9ad3d04a9c1c07f4226932f661baf 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
     if (!context.Attr<bool>("is_test")) {
       auto* mask = context.Output<Tensor>("Mask");
       auto* mask_data = mask->mutable_data<T>(context.GetPlace());
-      int seed = context.Attr<int>("seed");
+
+      // NOTE: fixed seed should only be used in unittest or for debug.
+      // Guarantee to use random seed in training.
+      std::random_device rnd;
       std::minstd_rand engine;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
       engine.seed(seed);
+
       std::uniform_real_distribution<float> dist(0, 1);
       size_t size = framework::product(mask->dims());
       for (size_t i = 0; i < size; ++i) {
diff --git a/paddle/operators/elementwise_pow_op.cc b/paddle/operators/elementwise_pow_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5293cc7dd34ccee860c50e964516da9b4d42d29c
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_pow_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwisePowOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Pow", "Out = X ^ Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
+                             ops::ElementwisePowOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/elementwise_pow_op.cu b/paddle/operators/elementwise_pow_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..643c978e635bc8e9671b47774c2eac5b713f59c2
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_pow_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6019e709e0db0fd62b4d3350bb768095f87ef241
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cmath>
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct PowFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwisePowKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
index b1957fb9ce6add8628cb206abf2c569d3f615c85..a08bd4233b02d021aaa64bafe4b855f11a60d338 100644
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -30,11 +30,12 @@ using Tensor = framework::Tensor;
 
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> {
     gru_value.state_weight =
         const_cast<T*>(weight_data + 2 * frame_size * frame_size);
     Tensor ordered_h0;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (h0) {
       // Since the batch computing for GRU reorders the input sequences
       // according to their length. The initialized cell state also needs
@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
     zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
 
     Tensor ordered_h0, ordered_h0_grad;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (h0) {
       ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
                                          true);
diff --git a/paddle/operators/label_smooth_op.cc b/paddle/operators/label_smooth_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c89082f44b360cbd171eccb212674040b8688a46
--- /dev/null
+++ b/paddle/operators/label_smooth_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/label_smooth_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LabelSmoothOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LabelSmoothOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LabelSmoothOp should not be null.");
+    auto in_dims = ctx->GetInputDim("X");
+    if (ctx->HasInput("PriorDist")) {
+      auto noise_dims = ctx->GetInputDim("PriorDist");
+      auto noise_numel = paddle::framework::product(noise_dims);
+      PADDLE_ENFORCE(
+          in_dims[1] == noise_numel,
+          "The number of elements in Input(PriorDist) must be equal to the "
+          "dimension of each label.");
+    }
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", in_dims);
+  }
+};
+
+class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) The input labels of LabelSmooth operator. This "
+             "input can be batched labels in one-hot encoding or output from "
+             "softmax, with shape [N x K], where N is the batch size and K is "
+             "the number of classes");
+    AddInput("PriorDist",
+             "(Tensor, optional)"
+             "The prior distribution to be added to the smoothed label. It is "
+             "fixed during training and the number of elements should be equal "
+             "to the dimension K of each label. Default is uniform "
+             "distribution and each element will be set to 1/K if not provided "
+             "in input.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(loDTensor) The smoothed label of LabelSmooth operator. It has"
+              "the same shape and LoD with the Input(LoDTensor).");
+    AddAttr<float>("epsilon",
+                   "(float, default 0.0f)"
+                   "The smoothing parameter of LabelSmooth operator.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+LabelSmooth Operator.
+
+Label smoothing is a mechanism to regularize the classifier layer. In machine 
+learning, optimizing the log-likelihood of the correct label directly may 
+cause two problems. First, it may result in overfitting: if the model learns 
+to assign full probability to the ground-truth label for each training example,
+it is not guaranteed to generalize. Second, it encourages the differences 
+between the largest logit and all others to become large, reducing the ability 
+of the model to adapt. Label smoothing is proposed to encourage the model to 
+be less confident, which replaces the ground-truth label $y$ with the weighted 
+sum of itself and some fixed distribution $\mu$, i.e.
+
+$$
+    \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
+$$
+
+where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and 
+$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for 
+$\mu$. This change in the ground-truth label is called label-smoothing 
+regularization or LSR.
+
+See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+
+)DOC");
+  }
+};
+
+class LabelSmoothGradOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothGradOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+            label_smooth_grad, ops::LabelSmoothGradOp);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth_grad,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/label_smooth_op.cu b/paddle/operators/label_smooth_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5a0cec12bc58a56e4b0c3bd6fbc6c4754ef81fa4
--- /dev/null
+++ b/paddle/operators/label_smooth_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/label_smooth_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth_grad,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/label_smooth_op.h b/paddle/operators/label_smooth_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..87bc9f793e3b4e249142710243c45d51f3a913b2
--- /dev/null
+++ b/paddle/operators/label_smooth_op.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LabelSmoothKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
+    auto* in_t = ctx.Input<framework::LoDTensor>("X");
+    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
+    auto label_dim = in_t->dims()[1];
+    out_t->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto in = framework::EigenVector<T>::Flatten(*in_t);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    if (dist_t) {
+      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
+      out.device(dev) =
+          static_cast<T>(1 - epsilon) * in +
+          epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
+    } else {
+      out.device(dev) = static_cast<T>(1 - epsilon) * in +
+                        static_cast<T>(epsilon / label_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LabelSmoothGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    d_in_t->mutable_data<T>(ctx.GetPlace());
+
+    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
+    auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);
+
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c6d2ae4d05becaeed34d66cad398cc90f9d3ece
--- /dev/null
+++ b/paddle/operators/layer_norm_op.cc
@@ -0,0 +1,370 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/layer_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+using EigenMatrixMapRowMajor = Eigen::Map<
+    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+template <typename T>
+using ConstEigenMatrixMapRowMajor = Eigen::Map<
+    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+class LayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of LayerNormOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
+    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
+                      "'begin_norm_axis' must be less than the rank of X.");
+
+    auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+    }
+
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Mean", {left});
+    ctx->SetOutputDim("Variance", {left});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("Scale",
+             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddOutput("Y", "(LoDTensor) Result after normalization.");
+    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
+        .AsIntermediate();
+    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+        .AsIntermediate();
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-5) Constant for "
+                   "numerical stability")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<int>("begin_norm_axis",
+                 "(int default:1), the "
+                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
+                 "matrix [N,H].")
+        .SetDefault(1)
+        .AddCustomChecker([](const int &begin_norm_axis) {
+          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
+                            "'begin_norm_axis' should be greater than zero.");
+        });
+
+    AddComment(R"DOC(
+Layer Normalization.
+
+Layer Norm has been implemented as discussed in the paper:
+https://arxiv.org/abs/1607.06450
+...
+)DOC");
+  }
+};
+
+template <typename T>
+class LayerNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    auto *output = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    output->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+
+    auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+
+    auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
+    auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
+    auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
+
+    auto squre = [](T ele) { return ele * ele; };
+    auto add_epslion = [epsilon](T ele) { return ele + epsilon; };
+
+    mean_map = input_map.rowwise().mean();
+    var_map = (input_map - mean_map.replicate(1, right))
+                  .unaryExpr(squre)
+                  .rowwise()
+                  .mean()
+                  .unaryExpr(add_epslion);
+
+    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+    // TODO(zcd): Some thinking about output_map, is it appropriate that
+    // `output_map` and `input_map` point to the same memory.
+    auto inv_std = var_map.unaryExpr(inv_std_func);
+    if (scale && bias) {
+      auto scale_map =
+          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right))
+                       .cwiseProduct(scale_map.replicate(left, 1)) +
+                   bias_map.replicate(left, 1);
+    } else if (scale) {
+      auto scale_map =
+          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right))
+                       .cwiseProduct(scale_map.replicate(left, 1));
+    } else if (bias) {
+      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right)) +
+                   bias_map.replicate(left, 1);
+    } else {
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right));
+    }
+  }
+};
+
+class LayerNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of LayerNormOp should not be null.");
+
+    // check output
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class LayerNormGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *mean = ctx.Input<Tensor>("Mean");
+    const auto *var = ctx.Input<Tensor>("Variance");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+
+    const auto &x_dims = x->dims();
+
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
+    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
+    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
+
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
+      d_bias_map = d_y_map.colwise().sum();
+    }
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      auto d_scale_map =
+          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+      // There are two equation to compute d_scale. One uses "Y" and the other
+      // does not use "Y"
+      d_scale_map =
+          ((x_map - mean_map.replicate(1, right))
+               .cwiseProduct(
+                   var_map.unaryExpr(inv_std_func).replicate(1, right))
+               .cwiseProduct(d_y_map))
+              .colwise()
+              .sum();
+    }
+
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
+      auto triple_product_func = [](T ele) { return ele * ele * ele; };
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+      // TODO(zcd): these code can be refined
+      if (d_scale) {
+        auto scale_map =
+            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+        // dy_dx
+        auto dx_end = var_map.unaryExpr(inv_std_func)
+                          .replicate(1, right)
+                          .cwiseProduct(d_y_map)
+                          .cwiseProduct(scale_map.replicate(left, 1));
+        // dy_dmean_dx
+        auto dx_mean = (T(-1.0) / right) *
+                       var_map.unaryExpr(inv_std_func)
+                           .replicate(1, right)
+                           .cwiseProduct(d_y_map)
+                           .cwiseProduct(scale_map.replicate(left, 1))
+                           .rowwise()
+                           .sum()
+                           .replicate(1, right);
+        // dy_var_dx
+        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
+                                 .cwiseProduct(scale_map.replicate(left, 1))
+                                 .cwiseProduct(d_y_map)
+                                 .rowwise()
+                                 .sum();
+        auto dvar_end = var_map.unaryExpr(inv_std_func)
+                            .unaryExpr(triple_product_func)
+                            .cwiseProduct(dvar_end_part)
+                            .replicate(1, right);
+        auto dx_var =
+            (T(-1.0) / right) *
+            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
+
+        d_x_map = dx_end + dx_mean + dx_var;
+      } else {
+        // dy_dx
+        auto dx_end = var_map.unaryExpr(inv_std_func)
+                          .replicate(1, right)
+                          .cwiseProduct(d_y_map);
+        // dy_dmean_dx
+        auto dx_mean = (T(-1.0) / right) *
+                       var_map.unaryExpr(inv_std_func)
+                           .replicate(1, right)
+                           .cwiseProduct(d_y_map)
+                           .rowwise()
+                           .sum()
+                           .replicate(1, right);
+        // dy_var_dx
+        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
+                                 .cwiseProduct(d_y_map)
+                                 .rowwise()
+                                 .sum();
+        auto dvar_end = var_map.unaryExpr(inv_std_func)
+                            .unaryExpr(triple_product_func)
+                            .cwiseProduct(dvar_end_part)
+                            .replicate(1, right);
+        auto dx_var =
+            (T(-1.0) / right) *
+            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
+
+        d_x_map = dx_end + dx_mean + dx_var;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm,
+    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bca35b91e6f52d35dee14aac9d080b52914942e3
--- /dev/null
+++ b/paddle/operators/layer_norm_op.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LayerNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class LayerNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/load_combine_op.cc b/paddle/operators/load_combine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4be793d7bf1f346c011842c57fb5b5179a697d6
--- /dev/null
+++ b/paddle/operators/load_combine_op.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fstream>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class LoadCombineOp : public framework::OperatorBase {
+ public:
+  LoadCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+
+    std::ifstream fin(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fin),
+                   "Cannot open file %s for load_combine op", filename);
+
+    auto out_var_names = Outputs("Out");
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      auto *out_var = scope.FindVar(out_var_names[i]);
+
+      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
+                     out_var_names[i]);
+
+      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+                     filename);
+
+      // Get data from fin to tensor
+      DeserializeFromStream(fin, tensor, dev_ctx);
+
+      if (platform::is_gpu_place(place)) {
+        // copy CPU to GPU
+        framework::LoDTensor cpu_tensor;
+        cpu_tensor.ShareDataWith(*tensor);
+        cpu_tensor.set_lod(tensor->lod());
+
+        // reset tensor
+        out_var->Clear();
+        tensor = out_var->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(cpu_tensor.lod());
+        Copy(cpu_tensor, place, dev_ctx, tensor);
+      }
+    }
+  }
+};
+
+class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput(
+        "Out",
+        "(vector) The output LoDTensors that will be read from the input file.")
+        .AsDuplicable();
+    AddAttr<std::string>("file_path",
+                         "(string) "
+                         "LoDTensors will be loaded from \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+LoadCombine Operator.
+
+LoadCombine operator loads LoDTensor variables from a file. The file should 
+contain one or more LoDTensors serialized using the SaveCombine operator. The 
+LoadCombine operator applies a deserialization strategy to appropriately load 
+the LodTensors, and this strategy complements the serialization strategy used 
+in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
+with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+that were saved using the SaveCombine operator.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
+                  ops::LoadCombineOpProtoMaker);
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index d97390fa1c53fa0bdf16ab34cb209b994621f83c..07372808bbf078bd2e9b0bb5782b95a046253f46 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
       new_rows.resize(ids_dim[0]);
       auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
-      memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
-                   ids_dim[0] * sizeof(int64_t), stream);
+      memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
+                   ids_data, ids_dim[0] * sizeof(int64_t), stream);
 
       d_table->set_rows(new_rows);
 
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
index c57ee414dc5b3417549c8ac3a7fd57a9c8f452df..72e95b75e29c88c5944607ceaa40435bac7a745c 100644
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -27,11 +27,12 @@ using Tensor = framework::Tensor;
 
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> {
     }
     lstm_value.prev_state_value = nullptr;
     Tensor ordered_c0;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (cell_t0) {
       // Since the batch computing for LSTM reorders the input sequence
       // according to their length. The initialized cell state also needs
@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
     // initialization.
     Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    const size_t* order = batch_gate->lod()[2].data();
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (c0) {
       ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
                                          true);
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
index ee82d5c10a5421b181e525f49a263d4808ede62f..e064a155dfadd8104fa80727a962cb2e24ade29f 100644
--- a/paddle/operators/lstmp_op.h
+++ b/paddle/operators/lstmp_op.h
@@ -34,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src, const size_t* index,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
@@ -109,7 +110,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
     }
     lstmp_value.prev_state_value = nullptr;
     Tensor ordered_c0;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (cell_t0) {
       // Since the batch computing for LSTMP reorders the input sequence
       // according to their length. The initialized cell state also needs
@@ -275,7 +278,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
     // initialization.
     Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
-    const size_t* order = batch_gate->lod()[2].data();
+
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+
     if (c0) {
       ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
                                          true);
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
index 0ee456f9bc61436bd0f2f8ef20dd1654e7e56d56..acdd87cb3550bc5f3891aed6fefd4301a3395f9f 100644
--- a/paddle/operators/math/selected_rows_functor.cu
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE_EQ(in1_height, input2.height());
     output->set_height(in1_height);
 
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
     auto& in2_rows = input2.rows();
     std::vector<int64_t> out_rows;
     out_rows.reserve(in1_rows.size() + in2_rows.size());
@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
 
     auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
     PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.data(), out_data, in1_row_numel);
+        in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
 
     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     auto in1_height = input1.height();
     PADDLE_ENFORCE_EQ(in1_height, input2->height());
 
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
     auto& in2_rows = *(input2->mutable_rows());
 
     auto& in1_value = input1.value();
@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
 
     auto& in1_value = input1.value();
-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
 
     int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
     PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddToTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.data(), in2_data, in1_row_numel);
+        in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
   }
 };
 
@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
   framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
                                      const framework::SelectedRows& input) {
     framework::SelectedRows out;
-    auto input_rows = input.rows();
+    framework::Vector<int64_t> input_rows(input.rows());
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
 
@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
     MergeAddKernel<
         T, 256><<<grid1, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(input_data, input.rows().data(), out_data,
-                                   out.rows().data(), out.rows().size(),
-                                   input_width);
+                      .stream()>>>(input_data, input_rows.cuda_data(), out_data,
+                                   out.mutable_rows()->cuda_data(),
+                                   out.rows().size(), input_width);
     return out;
   }
 };
@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
     dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
     dim3 grid(1, in1_rows.size());
     UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
-        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
-                                              in2_data, in1_row_numel);
+        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
+                                              op, in2_data, in1_row_numel);
   }
 };
 }  // namespace scatter
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc
index e459a42ca251a9fc79f745f48a118ce898a0f77e..17abce1c2f809f75edb2c5dc46709094c2ce10c3 100644
--- a/paddle/operators/math/sequence2batch.cc
+++ b/paddle/operators/math/sequence2batch.cc
@@ -23,8 +23,10 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
-                  framework::Tensor& dst, bool is_src_index) {
+                  const framework::Tensor& src,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  bool is_src_index) {
+    size_t* index = index_lod.data();
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu
index 452ae8951000872b706f7e4227a62dbf98109e7e..f27631271a42b4d64abef00d7f119b85e32edda4 100644
--- a/paddle/operators/math/sequence2batch.cu
+++ b/paddle/operators/math/sequence2batch.cu
@@ -42,8 +42,10 @@ template <typename T>
 class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& src, const size_t* index,
-                  framework::Tensor& dst, bool is_src_index) {
+                  const framework::Tensor& src,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  bool is_src_index) {
+    size_t* index = index_lod.cuda_data();
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h
index a5c43a2c7d4d729c35a20a27de2a23141e6019bc..6db0427b4174a09dd254d771e8d3d215cc6571a9 100644
--- a/paddle/operators/math/sequence2batch.h
+++ b/paddle/operators/math/sequence2batch.h
@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor {
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
   void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  const size_t* index, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
                   bool is_src_index);
 };
 
@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor {
       PADDLE_ENFORCE_EQ(lods[1].size(),
                         static_cast<size_t>(lod_tensor.dims()[0]));
       CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-      to_batch(context, lod_tensor, lods[1].data(), batch, true);
+      to_batch(context, lod_tensor, lods[1], batch, true);
       return;
     }
 
@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor {
     batch.set_lod(batch_lods);
 
     CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
-    to_batch(context, lod_tensor, seq2batch_idx, batch, true);
+    to_batch(context, lod_tensor, batch_lods[1], batch, true);
   }
 };
 
@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor {
     PADDLE_ENFORCE_EQ(in_lod[1].size(),
                       static_cast<size_t>(lod_tensor.dims()[0]));
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
-    size_t* index = in_lod[1].data();
-    to_seq(context, batch, index, lod_tensor, false);
+    to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
 };
 
diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu
index a38df26f59569c4fd54a1ba5691b2cd5f3245344..65c9cfe4a0ec14d220ad237baa71703a783ed0fa 100644
--- a/paddle/operators/math/sequence_padding.cu
+++ b/paddle/operators/math/sequence_padding.cu
@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     T* padding_data = padding.data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
-          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          padding_data, const_cast<T*>(seq_data),
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
-          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          padding_data, const_cast<T*>(seq_data),
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     }
   }
 };
@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     T* seq_data = seq.data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
-          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          const_cast<T*>(padding_data), seq_data,
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
-          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
-          sequence_width, max_sequence_length, num_sequences);
+          const_cast<T*>(padding_data), seq_data,
+          abs_offset_lod[level].cuda_data(), sequence_width,
+          max_sequence_length, num_sequences);
     }
   }
 };
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
index 4c9e6b375ce7251747b9cd443d86cca0858c84ef..f66534a6812a66c737445ea96914a393077d7d65 100644
--- a/paddle/operators/math/sequence_pooling.cu
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(num_seq, 1);
     auto stream = context.stream();
     KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
-        in_data, starts.data(), out_data, max_index, num_seq, dim);
+        in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
   }
 };
 
diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu
index ceaabd8e0fd81c927fbd4333c0aa7954b8da8513..fd4e28f6113729cd1fa9dc179bd9b601d29b8a7f 100644
--- a/paddle/operators/math/sequence_scale.cu
+++ b/paddle/operators/math/sequence_scale.cu
@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
 
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].data(), scales, seq_width);
+        seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
   }
 };
 
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 4a06babeda00f2420df80f81f876a0047a3285ef..84f24a909597915f0eebb6c9cad37510cbe93e7b 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/reduce_op.h"
-#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -38,10 +37,14 @@ class ReduceOp : public framework::OperatorWithKernel {
         dim, x_rank,
         "The dim should be in the range [-rank(input), rank(input)).");
     bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
     if (reduce_all) {
-      ctx->SetOutputDim("Out", {1});
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
     } else {
-      bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
       auto dims_vector = vectorize(x_dims);
       if (keep_dim || x_rank == 1) {
         dims_vector[dim] = 1;
diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu
index 41f2c5b9de91ade15b4010f56377675cfd1b611c..b3825212e1ac41b13a2f4cad2c128da39c5f6e71 100644
--- a/paddle/operators/row_conv_op.cu
+++ b/paddle/operators/row_conv_op.cu
@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.data();
+    size_t *idx = batch_indices.cuda_data();
     auto stream = context.cuda_device_context().stream();
 
     if (future_context <= 32) {
@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.data();
+    size_t *idx = batch_indices.cuda_data();
 
     auto &device_ctx = context.cuda_device_context();
     math::SetConstant<platform::CUDADeviceContext, T> zero;
diff --git a/paddle/operators/save_combine_op.cc b/paddle/operators/save_combine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bffa2908bc42d73332f22fa3706d24ab49cd4b38
--- /dev/null
+++ b/paddle/operators/save_combine_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(sidgoyal78): These function are needed by other files (save_op), move
+// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
+constexpr char kSEP = '/';
+static bool FileExists(const std::string &filepath) {
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+static void MkDir(const char *path) {
+  if (mkdir(path, 0755)) {
+    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+  }
+}
+
+static void MkDirRecursively(const char *fullpath) {
+  if (*fullpath == '\0') return;  // empty string
+  if (FileExists(fullpath)) return;
+
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
+
+class SaveCombineOp : public framework::OperatorBase {
+ public:
+  SaveCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto inp_var_names = Inputs("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      auto *var = scope.FindVar(inp_var_names[i]);
+
+      PADDLE_ENFORCE(var != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      auto &tensor = var->Get<framework::LoDTensor>();
+      // Serialize tensor
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
+};
+
+class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SaveCombine operator
+
+This operator will serialize and write a list of input LoDTensor variables 
+to a file on disk.
+)DOC");
+    AddAttr<bool>("overwrite",
+                  "(boolean, default true)"
+                  "Overwrite the output file if it exists.")
+        .SetDefault(true);
+    AddAttr<std::string>(
+        "file_path",
+        "(string)"
+        "The \"file_path\" where the LoDTensor variables will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
+                  ops::SaveCombineOpProtoMaker);
diff --git a/paddle/operators/save_load_combine_op_test.cc b/paddle/operators/save_load_combine_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ddc4a6c55d72e4e444869a1ebcd7662c892317
--- /dev/null
+++ b/paddle/operators/save_load_combine_op_test.cc
@@ -0,0 +1,180 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+USE_NO_KERNEL_OP(save_combine);
+USE_NO_KERNEL_OP(load_combine);
+
+int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
+                            std::string var_name,
+                            paddle::platform::CPUPlace& place,
+                            paddle::framework::Scope& scope,
+                            paddle::framework::LoD& expect_lod) {
+  auto var = scope.Var(var_name);
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({x, y});
+  expect_lod.resize(1);
+  for (size_t i = 0; i < lod_info.size(); i++) {
+    expect_lod[0].push_back(lod_info[i]);
+  }
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  return expect;
+}
+
+paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
+    const std::string out_var_name, paddle::framework::Scope& scope) {
+  auto load_var = scope.Var(out_var_name);
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  return target;
+}
+
+int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
+                                 paddle::framework::Scope& scope,
+                                 paddle::framework::LoD& actual_lod) {
+  int* actual = target->data<int>();
+  actual_lod = target->lod();
+  return actual;
+}
+
+void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
+                 paddle::framework::LoD actual_lod, const int& numel) {
+  for (int64_t i = 0; i < numel; ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
+
+// Here, we create 4 LoDTensors and use save_combine_op to first save these
+// in a single file. Then, we use load_combine_op to load these sequentially
+TEST(SaveLoadCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
+                                        expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
+                                        expect_lod2);
+
+  std::vector<int> lod3 = {0, 2, 3, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
+                                        scope, expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
+                                        expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
+
+  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
+  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
+  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
+  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
+}
+
+// Test with original SaveLoadTest
+TEST(SaveLoadTestWithCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("check_t.save")});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+  int* actual = target->data<int>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  auto& actual_lod = target->lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
index 40103d864fb58804b39ca5f3c63e802a430ce886..d829d5da174b73613da9dcfcd308a5b05e12bce9 100644
--- a/paddle/operators/save_load_op_test.cc
+++ b/paddle/operators/save_load_op_test.cc
@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) {
 
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({10, 10});
+  tensor->Resize({3, 10});
   paddle::framework::LoD expect_lod;
   expect_lod.resize(1);
   expect_lod[0].push_back(0);
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 51e6b9de58b64f6aa9d86d037abe907bba37d3ff..291d19ba8bd6113fb69719d5385792ad5cde387d 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -42,17 +42,25 @@ class SendOp : public framework::OperatorBase {
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
     for (size_t i = 0; i < ins.size(); i++) {
       VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-      client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+      rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
     }
-    PADDLE_ENFORCE(client_.Wait());
+    PADDLE_ENFORCE(rpc_client->Wait());
 
     for (auto& ep : endpoints) {
       VLOG(3) << "batch barrier, ep: " << ep;
-      client_.AsyncSendBatchBarrier(ep);
+      rpc_client->AsyncSendBatchBarrier(ep);
     }
-    PADDLE_ENFORCE(client_.Wait());
+    PADDLE_ENFORCE(rpc_client->Wait());
 
     if (outs.size() > 0) {
       for (size_t i = 0; i < outs.size(); i++) {
@@ -62,11 +70,6 @@ class SendOp : public framework::OperatorBase {
       PADDLE_ENFORCE(client_.Wait());
     }
   }
-
- private:
-  // TODO(typhoonzero): put RPCClient in a Variable, so that
-  // send and recv can use the same connection.
-  mutable detail::RPCClient client_;
 };
 
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -76,6 +79,9 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
     AddOutput("Out", "(Tensor) Output tensor to be received from server")
         .AsDuplicable();
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
     AddComment(R"DOC(
 Send operator
 
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu
index f1e3b96acd0259de2b3ca1348834bd17e1e174a2..a5311f15f0c607c880a6f12c0bef10b2dd8c8a79 100644
--- a/paddle/operators/sequence_erase_op.cu
+++ b/paddle/operators/sequence_erase_op.cu
@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
         num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
-
     // Set LoD for output
-    thrust::host_vector<size_t> out_lod0 = dev_out_lod;
+    std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
     framework::LoD out_lod;
     out_lod.push_back(out_lod0);
     out->set_lod(out_lod);
diff --git a/paddle/operators/sequence_reshape_op.cc b/paddle/operators/sequence_reshape_op.cc
index 57cca13105537d88fe942b850cae10650d3096e2..d89a46a712c9c84a142e1e347219ed171556d761 100644
--- a/paddle/operators/sequence_reshape_op.cc
+++ b/paddle/operators/sequence_reshape_op.cc
@@ -30,8 +30,13 @@ class SequenceReshapeOp : public framework::OperatorWithKernel {
     auto x_numel = product(x_dims);
     PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
     int new_dim = ctx->Attrs().Get<int>("new_dim");
-    ctx->SetOutputDim("Out",
-                      {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+    if (ctx->IsRuntime()) {
+      ctx->SetOutputDim("Out",
+                        {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+    } else {
+      // when compiling, the batch size is undetermined, just set to -1
+      ctx->SetOutputDim("Out", {-1, static_cast<int64_t>(new_dim)});
+    }
   }
 };
 
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index 42f8f8b2f072f9d204dfadcd732926b5c98dc617..29f5aa3542c26c76a1b80da61ec6752019216131 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
 
       auto& in_value = grad->value();
-      auto& in_rows = grad->rows();
+      framework::Vector<int64_t> in_rows(grad->rows());
 
       int64_t in_row_numel = in_value.numel() / in_rows.size();
       PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       dim3 grid(1, in_rows.size());
       SparseSGDFunctorKernel<
           T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          in_data, in_rows.data(), learning_rate->data<T>(), out_data,
+          in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
           in_row_numel);
 
     } else {
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 48201b344de0d3bd2b121a12389876dad095f10d..3d8102c3ae20c8b714cd48b4fc78dc18a0cf89a7 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> {
         }
       }
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // If is in_place, we store the input[0] to in0
+        auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto &rows = in_sel0.rows();
+#ifdef PADDLE_WITH_CUDA
+        std::vector<int64_t> rows_in_cpu;
+        rows_in_cpu.reserve(rows.size());
+        for (auto item : rows) {
+          rows_in_cpu.push_back(item);
+        }
+        in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
+#else
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+#endif
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+
+      auto get_selected_row = [&](size_t i) -> const SelectedRows & {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
+
       auto *out = context.Output<SelectedRows>("Out");
       out->mutable_rows()->clear();
       auto *out_value = out->mutable_value();
@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> {
       // Runtime InferShape
       size_t first_dim = 0;
       for (int i = 0; i < N; i++) {
-        first_dim += in_vars[i]->Get<SelectedRows>().rows().size();
+        auto &sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
       }
-      auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims();
-      auto in_dim_vec = framework::vectorize(in_dim);
-      in_dim_vec[0] = static_cast<int64_t>(first_dim);
+      auto in_dim =
+          framework::vectorize(get_selected_row(N - 1).value().dims());
+      in_dim[0] = static_cast<int64_t>(first_dim);
 
-      out_value->Resize(framework::make_ddim(in_dim_vec));
+      out_value->Resize(framework::make_ddim(in_dim));
       out_value->mutable_data<T>(context.GetPlace());
 
       math::SelectedRowsAddTo<DeviceContext, T> functor;
 
       int64_t offset = 0;
       for (int i = 0; i < N; i++) {
-        PADDLE_ENFORCE_EQ(out->height(),
-                          in_vars[i]->Get<SelectedRows>().height());
-        functor(context.template device_context<DeviceContext>(),
-                in_vars[i]->Get<SelectedRows>(), offset, out);
-        offset += in_vars[i]->Get<SelectedRows>().value().numel();
+        auto &sel_row = get_selected_row(i);
+
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
+        functor(context.template device_context<DeviceContext>(), sel_row,
+                offset, out);
+        offset += sel_row.value().numel();
       }
     } else if (out_var->IsType<framework::LoDTensorArray>()) {
       auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 490397afdd4de0cc1aafde746d31b1d800eded3b..a880d9bdbc63aacc1f2cdbc0d7da001a59c7b372 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) {
       .def(
           "__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_CUDA
-            new (&instance) LoDTensor(lod);
-#else
-             LoD new_lod;
-             new_lod.reserve(lod.size());
-             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-             new (&instance) LoDTensor(new_lod);
-#endif
+            LoD new_lod;
+            new_lod.reserve(lod.size());
+            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+            new (&instance) LoDTensor(new_lod);
           })
       .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_CUDA
-             self.set_lod(lod);
-#else
              LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
              self.set_lod(new_lod);
-#endif
            })
       .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifndef PADDLE_WITH_CUDA
-        return self.lod();
-#else
-           auto lod = self.lod();
-           std::vector<std::vector<size_t>> new_lod;
-           new_lod.reserve(lod.size());
-           std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
-               [](Vector<size_t> item) ->
-                   std::vector<size_t> {
-                 std::vector<size_t> v;
-                 v.reserve(item.size());
-                 std::copy(item.begin(), item.end(), std::back_inserter(v));
-                 return v;
-               });
-           return new_lod;
-#endif
+        auto lod = self.lod();
+        std::vector<std::vector<size_t>> new_lod;
+        new_lod.reserve(lod.size());
+        std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+        return new_lod;
       });
 
   py::class_<SelectedRows>(m, "SelectedRows")
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index f0620498cfa6775ce2949cc02fa9f6c9529dec2e..65c46745556bc5ea91fdd4e33060f2535422e8e8 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
 | ------ | -------- | ----------- |
 | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
 | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
-| `WITH_TESTING` | ON | Build unit tests binaries. |
+| `WITH_TESTING` | OFF | Build unit tests binaries. |
 | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
 | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
 | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index a7fb50ee4149a3c36077f83383f45f3106e7e0f1..a2f21e37e415ccaa0d9624656728d89739972905 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -22,7 +22,9 @@ limitations under the License. */
 int main(int argc, char** argv) {
   std::vector<char*> new_argv;
   std::string gflags_env;
-  new_argv.push_back(argv[0]);
+  for (int i = 0; i < argc; ++i) {
+    new_argv.push_back(argv[i]);
+  }
 #ifdef PADDLE_WITH_CUDA
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 4fdf4090212e31adcccf6b119c937e70d5cbf995..186b91c226accbe1c2d5465d6244b9438eec9979 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -140,8 +140,13 @@ def init_config_environment(
         g_submodel_stack=[],
         g_add_submodel_suffix=False, ):
 
-    for k, v in locals().iteritems():
-        globals()[k] = copy.deepcopy(v)
+    # directly iterate through locals().iteritems() will change
+    # the size of locals() due to introducing k, v into scope
+    # which will break the process in some env
+
+    local_vars = copy.deepcopy(locals())
+    for k, v in local_vars.iteritems():
+        globals()[k] = v
 
 
 # Because type is widely used as a variable name in this code.
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index 787416aed1acf81138df06110317614dfe77fb48..f52346c3b59264370f46844d0e6b1e2d489299c7 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -26,6 +26,7 @@ import initializer
 import layers
 import nets
 import optimizer
+import learning_rate_decay
 import backward
 import regularizer
 from param_attr import ParamAttr
@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler
 from distribute_transpiler_simple import SimpleDistributeTranspiler
 import clip
 from memory_optimization_transpiler import memory_optimize
+import profiler
 
 Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + [
-    'io',
-    'initializer',
-    'layers',
-    'nets',
-    'optimizer',
-    'backward',
-    'regularizer',
-    'LoDTensor',
-    'CPUPlace',
-    'CUDAPlace',
-    'Tensor',
+    'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
+    'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
     'ParamAttr'
-    'DataFeeder',
-    'clip',
-    'SimpleDistributeTranspiler',
-    'DistributeTranspiler',
-    'memory_optimize',
+    'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
+    'memory_optimize', 'profiler'
 ]
 
 
@@ -86,11 +76,9 @@ def __bootstrap__():
 
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
-    read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
-    ]
+    read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
     if core.is_compiled_with_cuda():
-        read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
+        read_env_flags += ['fraction_of_gpu_memory_to_use']
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
index 3028029e60fde2f481b4348ab1b0a4980ebb2b60..fdbc8524abb7d6687983b026ca8e65e61c3dfd1a 100644
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@@ -30,6 +30,9 @@ __all__ = [
 
 
 class BaseErrorClipAttr(object):
+    def __str__(self):
+        raise NotImplementedError()
+
     def append_clip_op(self, block, grad_name):
         raise NotImplementedError()
 
@@ -44,6 +47,9 @@ class ErrorClipByValue(BaseErrorClipAttr):
         self.max = max
         self.min = min
 
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
     def append_clip_op(self, block, grad_name):
         clip_op_desc = block.desc.append_op()
         clip_op_desc.set_type("clip")
@@ -71,6 +77,9 @@ def error_clip_callback(block, context):
 
 
 class BaseGradientClipAttr(object):
+    def __str__(self):
+        raise NotImplementedError()
+
     def process_context(self, context, param, grad):
         raise NotImplementedError()
 
@@ -79,6 +88,9 @@ class BaseGradientClipAttr(object):
 
 
 class NullGradientClipAttr(BaseGradientClipAttr):
+    def __str__(self):
+        return "Null"
+
     def process_context(self, context, param, grad):
         pass
 
@@ -96,6 +108,9 @@ class GradientClipByValue(BaseGradientClipAttr):
         self.max = max
         self.min = min
 
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
     def process_context(self, context, param, grad):
         pass
 
@@ -108,6 +123,9 @@ class GradientClipByNorm(BaseGradientClipAttr):
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
 
+    def __str__(self):
+        return "ByNorm, clip_norm=%f" % self.clip_norm
+
     def process_context(self, context, param, grad):
         pass
 
@@ -124,6 +142,10 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
         self.clip_norm = clip_norm
         self.group_name = group_name
 
+    def __str__(self):
+        return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
+                                                              self.clip_norm)
+
     def process_context(self, context, param, grad):
         if self.group_name not in context:
             context[self.group_name] = []
@@ -160,6 +182,17 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 
 
 def set_gradient_clip(clip, param_list=None, program=None):
+    """
+        To specify parameters that require gradient clip.
+        Args:
+            clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, 
+                    which describes the type and detailed attributes of required gradient clip.
+            param_list(list, None by default): Parameters that require gradient clip. 
+                    It can be a list of parameter or a list of parameter's name. 
+                    When it's None, all parameters in the program will be included. 
+            program(Program, None by default): The program where parameters are. 
+                    Will be the default main program when assigned with None.
+    """
     if not isinstance(clip, BaseGradientClipAttr):
         raise TypeError(
             "'clip' should be an instance of BaseGradientClipAttr's derived class"
@@ -199,3 +232,5 @@ def append_gradient_clip_ops(param_grad):
 
 
 ClipByValue = GradientClipByValue
+ClipByNorm = GradientClipByNorm
+ClipByGlobalNorm = GradientClipByGlobalNorm
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
index d366a7086df0794eb68e0ac1f7b95ff695577cc9..121b407cae41fa477843b7252ebacc9053d5f7aa 100644
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -153,11 +153,18 @@ class DistributeTranspiler:
             self.param_grad_ep_mapping[ep]["params"].append(param)
             self.param_grad_ep_mapping[ep]["grads"].append(grad)
 
+        rpc_client_var = program.global_block().create_var(
+            name="RPC_CLIENT_VAR",
+            psersistable=True,
+            dtype='float32',  # dtype and shape is not used in fact
+            shape=[0])
+
         # create send_op
         send_op = program.global_block().append_op(
             type="send",
             inputs={"X": send_inputs},
-            outputs={"Out": send_outputs},
+            outputs={"Out": send_outputs,
+                     "RPCClient": rpc_client_var},
             attrs={"endpoints": pserver_endpoints,
                    "epmap": eplist})
         # step4
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 3d85d4550f8e53ed52aa933a17fe85171b81cc6b..ae98e299a4838db8b7089e09683c33c940cfde3d 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -14,6 +14,7 @@
 
 import collections
 import contextlib
+import re
 
 import numpy as np
 
@@ -239,20 +240,30 @@ class Variable(object):
     def __str__(self):
         return self.to_string(True)
 
-    def to_string(self, throw_on_error):
+    def to_string(self, throw_on_error, with_details=False):
         """
         Get debug string.
 
         Args:
             throw_on_error(bool): True if raise an exception when self is not
                 intialized.
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
 
         Returns(str): The debug string.
 
         """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
         protostr = self.desc.serialize_to_string()
         proto = framework_pb2.VarDesc.FromString(str(protostr))
-        return _debug_string_(proto, throw_on_error)
+        res_str = _debug_string_(proto, throw_on_error)
+        if with_details:
+            additional_attr = ("error_clip", "stop_gradient")
+            for attr_name in additional_attr:
+                res_str += "%s: %s\n" % (attr_name,
+                                         str(getattr(self, attr_name)))
+        return res_str
 
     __repr__ = __str__
 
@@ -629,10 +640,36 @@ class Block(object):
     def __str__(self):
         return self.to_string(True)
 
-    def to_string(self, throw_on_error):
-        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.BlockDesc.FromString(str(protostr))
-        return _debug_string_(proto, throw_on_error)
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            re_add_indent = re.compile(r"\n(.)")
+            res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
+                self.idx, self.parent_idx)
+            for var in self.vars.itervalues():
+                res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
+                    r"\n    \1", var.to_string(throw_on_error, with_details))
+            for op in self.ops:
+                res_str += "\n  ops {\n    %s  }" % re_add_indent.sub(
+                    r"\n    \1", op.to_string(throw_on_error))
+            res_str += "\n}"
+        else:
+            protostr = self.desc.serialize_to_string()
+            proto = framework_pb2.BlockDesc.FromString(str(protostr))
+            res_str = _debug_string_(proto, throw_on_error)
+        return res_str
 
     __repr__ = __str__
 
@@ -796,10 +833,29 @@ class Program(object):
     def __str__(self):
         return self.to_string(True)
 
-    def to_string(self, throw_on_error):
-        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.ProgramDesc.FromString(str(protostr))
-        return _debug_string_(proto, throw_on_error)
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            res_str = ""
+            for block in self.blocks:
+                res_str += block.to_string(throw_on_error, with_details)
+        else:
+            protostr = self.desc.serialize_to_string()
+            proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+            res_str = _debug_string_(proto, throw_on_error)
+        return res_str
 
     def get_desc(self):
         return self.desc
@@ -950,6 +1006,36 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+    def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            res_str = Variable.to_string(self, throw_on_error, True)
+            additional_attr = ("trainable", "optimize_attr", "regularizer",
+                               "gradient_clip_attr")
+            for attr_name in additional_attr:
+                res_str += "%s: %s\n" % (attr_name,
+                                         str(getattr(self, attr_name)))
+        else:
+            res_str = Variable.to_string(self, throw_on_error, False)
+        return res_str
+
+    __repr__ = __str__
+
 
 # program is a global instance.
 _main_program_ = Program()
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 7d9ae53d94b6c82890150346f138e48a0dfbf15c..2119ca12c8dea6463934aa68cb1b46ec687e3f72 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -18,7 +18,7 @@ import itertools
 from framework import Variable, Parameter, default_main_program, default_startup_program, \
     unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
-from param_attr import ParamAttr
+from param_attr import ParamAttr, WeightNormParamAttr
 
 
 class LayerHelper(object):
@@ -104,6 +104,177 @@ class LayerHelper(object):
                                  (dtype, each.dtype))
         return dtype
 
+    def _create_weight_normalize(self, attr, shape, dtype):
+        from .layers import elementwise_mul, elementwise_div, reshape
+
+        # Remove these ops when LayerHelper and layers support indicating
+        # program and block.
+        def __norm_op(x,
+                      out=None,
+                      p=2,
+                      dim=None,
+                      keep_dim=False,
+                      block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join([self.name, 'weight_norm_norm'])),
+                    dtype=dtype,
+                    persistable=False)
+            abs_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_abs'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='abs', inputs={'X': x}, outputs={'Out': abs_out})
+            pow_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_pow'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='pow',
+                inputs={'X': abs_out},
+                outputs={'Out': pow_out},
+                attrs={'factor': float(p)})
+            sum_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_sum'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='reduce_sum',
+                inputs={'X': pow_out},
+                outputs={'Out': sum_out},
+                attrs={
+                    'dim': dim,
+                    'keep_dim': keep_dim,
+                    'reduce_all': True if dim is None else False
+                })
+            block.append_op(
+                type='pow',
+                inputs={'X': sum_out},
+                outputs={'Out': out},
+                attrs={'factor': 1. / p})
+            return out
+
+        def __reshape_op(x,
+                         shape,
+                         out=None,
+                         block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join(
+                        [self.name, 'weight_norm_reshape'])),
+                    dtype=dtype,
+                    persistable=False)
+            block.append_op(
+                type='reshape',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'shape': shape})
+            return out
+
+        def __transpose_op(x,
+                           axis,
+                           out=None,
+                           block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join(
+                        [self.name, 'weight_norm_transpose'])),
+                    dtype=dtype,
+                    persistable=False)
+            block.append_op(
+                type='transpose',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'axis': axis})
+            return out
+
+        def __norm_except_dim(x,
+                              out=None,
+                              dim=None,
+                              block=self.startup_program.global_block()):
+            """Computes the norm over all dimensions except dim"""
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join([self.name, 'weight_norm_norm'])),
+                    dtype=dtype,
+                    persistable=False)
+            if dim is None:
+                __norm_op(x, out, dim=dim, block=block)
+            elif dim == 0:
+                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
+                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
+                norm = __norm_op(reshape, dim=1, block=block)
+                __reshape_op(norm, out=out, shape=out_shape, block=block)
+            elif dim == len(x.shape) - 1:
+                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
+                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
+                norm = __norm_op(reshape, dim=0, block=block)
+                __reshape_op(norm, out=out, shape=out_shape, block=block)
+            else:
+                perm = range(len(x.shape))
+                perm[0], perm[dim] = dim, 0
+                transpose = __transpose_op(x, perm, block=block)
+                norm = __norm_op(transpose, dim=0, block=block)
+                __transpose_op(norm, perm, out=out, block=block)
+            return out
+
+        def __weight_normalize(g, v, dim):
+            """Calculations for weight normalization"""
+            norm = __norm_except_dim(
+                v, dim=dim, block=self.main_program.current_block())
+            scale = elementwise_div(
+                x=g, y=norm)  # The shapes of g and norm are the same.
+            # Currently, elementwise_mul only support broadcast when the shape
+            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
+            # to achive the subset.
+            w = elementwise_mul(
+                x=v,
+                y=scale if dim is None else reshape(
+                    x=scale, shape=[v.shape[dim]]),
+                axis=-1 if dim is None else dim)
+            # To serialize the original parameter for inference, maybe a
+            # parameter rather than a variable should be returned.
+            return w
+
+        g_param_attr = copy.deepcopy(attr)
+        g_param_attr.name = attr.name + '_g'
+        g_param_shape = [1] * len(shape)
+        if attr.dim is not None:
+            g_param_shape[attr.dim] = shape[attr.dim]
+        v_param_attr = copy.deepcopy(attr)
+        v_param_attr.name = attr.name + '_v'
+        v_param_shape = shape
+
+        # Add to startup_program to initialize g and v.
+        # Try to reconstruct the initializer of w by initializing g and v.
+        # Set the initializers of g and v as below, then the distribution
+        # of w is the same as initializing w with the given initializer.
+        # For Data-Dependent Initialization, please compute the init-values
+        # of g and v in external and then feed the values to g and v by
+        # executing an extra program.
+        g_param = self.startup_program.global_block().create_parameter(
+            dtype=dtype,
+            shape=g_param_shape,
+            **g_param_attr.to_kwargs(with_initializer=False))
+        v_param = self.startup_program.global_block().create_parameter(
+            dtype=dtype,
+            shape=v_param_shape,
+            **v_param_attr.to_kwargs(with_initializer=True))
+        __norm_except_dim(
+            x=v_param,
+            out=g_param,
+            dim=attr.dim,
+            block=self.startup_program.global_block())
+
+        # Add weight normalization to main_program
+        g_param = self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs())
+        v_param = self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs())
+        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
+        return w_param
+
     def create_parameter(self,
                          attr,
                          shape,
@@ -114,16 +285,23 @@ class LayerHelper(object):
         attr = copy.deepcopy(attr)
         assert isinstance(attr, ParamAttr)
         suffix = 'b' if is_bias else 'w'
+        if attr.name is None:
+            attr.name = unique_name(".".join([self.name, suffix]))
 
-        if default_initializer is None:
+        if default_initializer is None and attr.initializer is None:
             if is_bias:
                 attr.set_default_bias_initializer()
             else:
                 attr.set_default_param_initializer()
         else:
             attr.set_default_initializer(default_initializer)
-        if attr.name is None:
-            attr.name = unique_name(".".join([self.name, suffix]))
+
+        # If weight normalization is set, insert extra parameters and ops.
+        # Refer to https://arxiv.org/pdf/1602.07868.pdf
+        if isinstance(attr, WeightNormParamAttr):
+            param = self._create_weight_normalize(attr, shape, dtype)
+            WeightNormParamAttr.params_with_weight_norm.append(param)
+            return param
 
         self.startup_program.global_block().create_parameter(
             dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py
index f359e70126f7601b75261e795b5a37bdc241112e..79a130a3eb148e6c5a8fa3cdf174780b354c23c9 100644
--- a/python/paddle/v2/fluid/layers/math_op_patch.py
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -145,7 +145,9 @@ def monkey_patch_variable():
             # a*b == b*a. Do not need to reverse explicitly
         ("__rmul__", "elementwise_mul", False),
         ("__div__", "elementwise_div", False),
-        ("__rdiv__", "elementwise_div", True)):
+        ("__rdiv__", "elementwise_div", True),
+        ("__pow__", "elementwise_pow", False),
+        ("__rpow__", "elementwise_pow", True)):
         setattr(Variable, method_name,
                 _elemwise_method_creator_(method_name, op_type, reverse))
 
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index d11dccfd22124d58d8634c01a00527c373b92f00..c38e21087de1bf7076ce5aaf23d4d4faaebb50a7 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -847,7 +847,35 @@ def cos_sim(X, Y, **kwargs):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
+def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
+    """
+    Computes dropout.
+
+    Drop or keep each element of `x` independently. Dropout is a regularization
+    technique for reducing overfitting by preventing neuron co-adaption during
+    training. The dropout operator randomly set (according to the given dropout
+    probability) the outputs of some units to zero, while others are remain
+    unchanged.
+
+    Args:
+       x(variable): The input tensor.
+       dropout_prob(float): Probability of setting units to zero.
+       is_test(bool): A flag indicating whether it is in test phrase or not.
+       seed(int): A Python integer used to create random seeds. If this
+                  parameter is set to None, a random seed is used.
+                  NOTE: If an integer seed is given, always the same output
+                  units will be dropped. DO NOT use a fixed seed in training.
+
+    Returns:
+        Variable: A tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
+    """
+
     helper = LayerHelper('dropout', **kwargs)
     out = helper.create_tmp_variable(dtype=x.dtype)
     mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
@@ -856,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
         inputs={'X': [x]},
         outputs={'Out': [out],
                  'Mask': [mask]},
-        attrs={'dropout_prob': dropout_prob,
-               'is_test': is_test,
-               'seed': seed})
+        attrs={
+            'dropout_prob': dropout_prob,
+            'is_test': is_test,
+            'fix_seed': seed is not None,
+            'seed': seed if seed is not None else 0
+        })
     return out
 
 
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 022a94cad440f13383a927233195bb008a688843..ee3172c7b8dfd65c693e5aee9b55179e654ce7be 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -56,6 +56,7 @@ __all__ = [
     'elementwise_mul',
     'elementwise_max',
     'elementwise_min',
+    'elementwise_pow',
     'clip',
     'clip_by_norm',
     'sequence_softmax',
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 6e7d09459c07c77a8579300a1c67ae36dc3d2ba2..c435c5206d1ef1ef57683a1a47bf089be6526f38 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
 from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
+from ..initializer import Constant
 from ..core import DataType
 import numpy
 
 __all__ = [
     'create_tensor',
     'create_parameter',
+    'create_global_var',
     'cast',
     'concat',
     'sums',
@@ -58,13 +60,22 @@ def create_parameter(shape,
     Returns:
         Parameter: the created parameter
     """
-    helper = LayerHelper("create_parameter")
+    helper = LayerHelper("create_parameter", **locals())
     if attr is None:
         attr = ParamAttr()
     return helper.create_parameter(attr, shape, dtype, is_bias,
                                    default_initializer)
 
 
+def create_global_var(shape, value, dtype, persistable=False, name=None):
+    helper = LayerHelper("global_var", **locals())
+    var = helper.create_global_variable(
+        dtype=dtype, shape=shape, persistable=persistable, name=name)
+    helper.set_variable_initializer(
+        var, initializer=Constant(value=float(value)))
+    return var
+
+
 def cast(x, dtype):
     """
     This function takes in the input with input_dtype
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
new file mode 100644
index 0000000000000000000000000000000000000000..96b3e9a0d73cede5d6e36308a53ab8927a95a6da
--- /dev/null
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import layers
+from framework import Variable
+
+__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
+"""
+When training a model, it's often useful to decay the
+learning rate during training process, this is called
+learning_rate_decay. There are many strategies to do
+this, this module will provide some classical method.
+User can also implement their own learning_rate_decay
+strategy according to this module.
+"""
+
+
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies exponential decay to the learning rate.
+
+    ```python
+    decayed_learning_rate = learning_rate *
+            decay_rate ^ (global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for exponential_decay.")
+
+    # update learning_rate
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * (decay_rate**div_res)
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies natural exponential decay to the initial learning rate.
+
+    ```python
+    if not staircase:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    else:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for natural_exp_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    """Applies inverse time decay to the initial learning rate.
+
+    ```python
+    if staircase:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+    else
+      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training
+        global_step: A Variable that record the training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+
+    return learning_rate / (1 + decay_rate * div_res)
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index 0c3533b892176edd5dfd111fdd771cc17d468168..7844a4e2df1ce3989e48082f6472292560fbf1ee 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -15,6 +15,7 @@
 from collections import defaultdict
 
 import framework
+import layers
 from backward import append_backward
 from framework import unique_name, program_guard
 from initializer import Constant
@@ -33,9 +34,11 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, global_step=None, regularization=None):
+    def __init__(self, learning_rate, global_step=None, regularization=None):
+        assert learning_rate is not None
         self._global_step = global_step
         self.regularization = regularization
+        self._global_learning_rate = learning_rate
         # Dictionary of accumulators. Some optimizer subclasses need to
         # allocate and manage extra variables associated with the parameters
         # to train. These variables are called accumulators.
@@ -43,6 +46,28 @@ class Optimizer(object):
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
 
+    def _create_global_learning_rate(self):
+        if isinstance(self._global_learning_rate, float):
+            self._global_learning_rate = layers.create_global_var(
+                name=unique_name("learning_rate"),
+                shape=[1],
+                value=float(self._global_learning_rate),
+                dtype='float32',
+                persistable=True)
+
+        if not isinstance(self._global_learning_rate, framework.Variable):
+            raise ValueError("learning rate should be a Variable, "
+                             "actual type is %s",
+                             type(self._global_learning_rate))
+
+    @property
+    def global_learning_rate(self):
+        """
+        get global decayed learning rate
+        :return:
+        """
+        return self._global_learning_rate
+
     def _append_optimize_op(self, block, param_and_grad):
         """ append optimize operator to block and return all the added optimize_op
         """
@@ -52,17 +77,7 @@ class Optimizer(object):
         # create learning rate variable for every parameter
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
-        param_lr_shape = [1]
-        param_lr_var = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=param_lr_shape,
-            lod_level=1,
-            persistable=True)
-        param_lr = param_lr * self._learning_rate
-        self.helper.set_variable_initializer(
-            var=param_lr_var, initializer=Constant(param_lr))
-        return param_lr_var
+        return self._global_learning_rate * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -163,7 +178,7 @@ class Optimizer(object):
           optimization. This will include parameter update ops, global step
           update ops and any other custom ops required by subclasses to manage
           their internal state.
-          :param startup_program: 
+          :param startup_program:
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -178,6 +193,7 @@ class Optimizer(object):
             self.helper = LayerHelper(self.__class__.__name__)
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
+            self._create_global_learning_rate()
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
@@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer):
 
     def __init__(self, learning_rate, **kwargs):
         assert learning_rate is not None
-        super(SGDOptimizer, self).__init__(**kwargs)
+        super(SGDOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "sgd"
-        self._learning_rate = learning_rate
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
@@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer):
     def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
         assert learning_rate is not None
         assert momentum is not None
-        super(MomentumOptimizer, self).__init__(**kwargs)
+        super(MomentumOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "momentum"
-        self._learning_rate = learning_rate
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
 
@@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer):
     def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
         assert learning_rate is not None
         assert epsilon is not None
-        super(AdagradOptimizer, self).__init__(**kwargs)
+        super(AdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "adagrad"
-        self._learning_rate = learning_rate
         self._epsilon = epsilon
 
     def _create_accumulators(self, block, parameters):
@@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer):
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamOptimizer, self).__init__(**kwargs)
+        super(AdamOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "adam"
-        self._learning_rate = learning_rate
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
@@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer):
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
-        super(AdamaxOptimizer, self).__init__(**kwargs)
+        super(AdamaxOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "adamax"
-        self._learning_rate = learning_rate
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
@@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer):
         assert decay is not None
         assert epsilon is not None
 
-        super(DecayedAdagradOptimizer, self).__init__(**kwargs)
+        super(DecayedAdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
         self.type = "decayed_adagrad"
-        self._learning_rate = learning_rate
         self._decay = decay
         self._epsilon = epsilon
 
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
index dcca8b6c547d10864ff4cd0af1c217d89e3b522f..fc566b8a2480ce9256d610b4731405cd6d89b7e4 100644
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -15,7 +15,10 @@
 from initializer import Initializer, Xavier, Constant
 from regularizer import WeightDecayRegularizer
 
-__all__ = ['ParamAttr']
+__all__ = [
+    'ParamAttr',
+    'WeightNormParamAttr',
+]
 
 
 class ParamAttr(object):
@@ -82,3 +85,20 @@ class ParamAttr(object):
         if with_initializer:
             kwargs['initializer'] = self.initializer
         return kwargs
+
+
+class WeightNormParamAttr(ParamAttr):
+    """
+    Used for weight normalization. Any field in ParamAttr can also be set here.
+    Besides, an extra field dim can be set to indicate the dimension except 
+    which to normalize.
+    """
+    # List to record the parameters reparameterized by weight normalization.
+    # If these parameters are treated as Variable rather than Parameter,
+    # it can be used to discriminate these parameters and help to serialize
+    # these paramters for inference.
+    params_with_weight_norm = []
+
+    def __init__(self, dim=None, **kwargs):
+        super(WeightNormParamAttr, self).__init__(**kwargs)
+        self.dim = dim
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index 51c1c8aa705513825b46fb936c6c99090c50fb7d..d4a2cd7eeabecb60699b5be94d89cf7a916749e7 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2.fluid.core as core
+import core
 from contextlib import contextmanager
 import os
 
-__all__ = ['CudaProfiler']
+__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
 
 NVPROF_CONFIG = [
     "gpustarttimestamp",
diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py
index c2f28eecfda71e305d96c5a6b62c4f5f0fbf3fa6..0273da647afb6e95a136b5ecd0975347d9a378ff 100644
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -87,6 +87,11 @@ class WeightDecayRegularizer(object):
         """
         raise NotImplementedError()
 
+    def __str__(self):
+        """Debug string
+        """
+        raise NotImplementedError()
+
 
 class L2DecayRegularizer(WeightDecayRegularizer):
     """Implements the L2 Weight Decay Regularization
@@ -123,6 +128,9 @@ class L2DecayRegularizer(WeightDecayRegularizer):
 
         return decay
 
+    def __str__(self):
+        return "L2Decay, regularization_coeff=%f" % self._regularization_coeff
+
 
 class L1DecayRegularizer(WeightDecayRegularizer):
     """Implements the L1 Weight Decay Regularization
@@ -163,6 +171,9 @@ class L1DecayRegularizer(WeightDecayRegularizer):
 
         return decay
 
+    def __str__(self):
+        return "L1Decay, regularization_coeff=%f" % self._regularization_coeff
+
 
 # We short the class name, since users will use the regulaizer with the package
 # name. The sample code:
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
index 1a342bf1fbbc0e5f4e3c7d440424b66c4b9f732f..f85768de99adb8b5005b23278ad807a24c5bff65 100644
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -175,7 +175,7 @@ def main():
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
         batch_size=BATCH_SIZE)
-    #place = fluid.CPUPlace()
+    # place = fluid.CPUPlace()
     place = fluid.CUDAPlace(0)
     feeder = fluid.DataFeeder(
         feed_list=[
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
index ac7ef4046f9ff55c2cbfc28b50784b9bffb80d53..b4b6020f58e7538dfe0f98c17d61f3614c3c6fc4 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -45,8 +45,9 @@ BATCH_SIZE = 64
 def loss_net(hidden, label):
     prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    return fluid.layers.mean(x=loss), fluid.layers.accuracy(
-        input=prediction, label=label)
+    avg_loss = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return prediction, avg_loss, acc
 
 
 def mlp(img, label):
@@ -73,8 +74,7 @@ def conv_net(img, label):
     return loss_net(conv_pool_2, label)
 
 
-def main():
-    args = parse_arg()
+def train(args, save_dirname=None):
     print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))
 
     img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
@@ -91,7 +91,8 @@ def main():
         with pd.do():
             img_ = pd.read_input(img)
             label_ = pd.read_input(label)
-            for o in net_conf(img_, label_):
+            prediction, avg_loss, acc = net_conf(img_, label_)
+            for o in [avg_loss, acc]:
                 pd.write_output(o)
 
         avg_loss, acc = pd()
@@ -99,7 +100,7 @@ def main():
         avg_loss = fluid.layers.mean(x=avg_loss)
         acc = fluid.layers.mean(x=acc)
     else:
-        avg_loss, acc = net_conf(img, label)
+        prediction, avg_loss, acc = net_conf(img, label)
 
     test_program = fluid.default_main_program().clone()
 
@@ -137,7 +138,10 @@ def main():
                 acc_val = numpy.array(acc_set).mean()
                 avg_loss_val = numpy.array(avg_loss_set).mean()
                 if float(acc_val) > 0.85:  # test acc > 85%
-                    exit(0)
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, ["img"],
+                                                      [prediction], exe)
+                    return
                 else:
                     print(
                         'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
@@ -145,5 +149,36 @@ def main():
                                float(avg_loss_val), float(acc_val)))
 
 
+def infer(args, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be feeded 
+    # data using feed operators), and the fetch_targets (variables that 
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+
+
 if __name__ == '__main__':
-    main()
+    args = parse_arg()
+    if not args.use_cuda and not args.parallel:
+        save_dirname = "recognize_digits_" + args.nn_type + ".inference.model"
+    else:
+        save_dirname = None
+    train(args, save_dirname)
+    infer(args, save_dirname)
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
similarity index 52%
rename from python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
rename to python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
index 529223eba8af6d968b490068f34559880312515d..2ba9077a26202b1c16cc480823115f7ad55c2c67 100644
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment.py
@@ -1,4 +1,4 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,9 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import paddle.v2 as paddle
+import unittest
 import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import contextlib
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy
 
 
 def stacked_lstm_net(data,
@@ -51,63 +78,77 @@ def stacked_lstm_net(data,
     avg_cost = fluid.layers.mean(x=cost)
     adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
     adam_optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0]
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy
 
-    word_dict = paddle.dataset.imdb.word_dict()
-    print "load word dict successfully"
+
+def main(word_dict, net_method, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    BATCH_SIZE = 128
+    PASS_NUM = 5
     dict_dim = len(word_dict)
     class_dim = 2
 
     data = fluid.layers.data(
         name="words", shape=[1], dtype="int64", lod_level=1)
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out = stacked_lstm_net(
+    cost, acc_out = net_method(
         data, label, input_dim=dict_dim, class_dim=class_dim)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.imdb.train(word_dict), buf_size=1000),
         batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
 
     exe.run(fluid.default_startup_program())
 
     for pass_id in xrange(PASS_NUM):
-        accuracy.reset(exe)
         for data in train_data():
             cost_val, acc_val = exe.run(fluid.default_main_program(),
                                         feed=feeder.feed(data),
                                         fetch_list=[cost, acc_out])
-            pass_acc = accuracy.eval(exe)
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
-                  " pass_acc=" + str(pass_acc))
-            if cost_val < 1.0 and acc_val > 0.8:
-                exit(0)
-    exit(1)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if cost_val < 0.4 and acc_val > 0.8:
+                return
+    raise AssertionError("Cost is too large for {0}".format(
+        net_method.__name__))
+
+
+class TestUnderstandSentiment(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.word_dict = paddle.dataset.imdb.word_dict()
+
+    @contextlib.contextmanager
+    def new_program_scope(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+    def test_conv_cpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=convolution_net, use_cuda=False)
+
+    def test_stacked_lstm_cpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=False)
+
+    def test_conv_gpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=convolution_net, use_cuda=True)
+
+    def test_stacked_lstm_gpu(self):
+        with self.new_program_scope():
+            main(self.word_dict, net_method=stacked_lstm_net, use_cuda=True)
 
 
 if __name__ == '__main__':
-    main()
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
deleted file mode 100644
index df27399dd215a579d7e3f8a1659180a06b1e7f64..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-
-
-def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
-                    hid_dim=32):
-    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = fluid.layers.fc(input=[conv_3, conv_4],
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
-    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, accuracy.metrics[0]
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    cost, accuracy, acc_out = convolution_net(
-        data, label, input_dim=dict_dim, class_dim=class_dim)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-
-    exe.run(fluid.default_startup_program())
-
-    for pass_id in xrange(PASS_NUM):
-        accuracy.reset(exe)
-        for data in train_data():
-            cost_val, acc_val = exe.run(fluid.default_main_program(),
-                                        feed=feeder.feed(data),
-                                        fetch_list=[cost, acc_out])
-            pass_acc = accuracy.eval(exe)
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
-                  " pass_acc=" + str(pass_acc))
-            if cost_val < 1.0 and pass_acc > 0.8:
-                exit(0)
-    exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
deleted file mode 100644
index 117f74c59ad5bf6bb67711801cd7b9a41f39f1f8..0000000000000000000000000000000000000000
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-from paddle.v2.fluid.layer_helper import LayerHelper
-
-
-def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
-    """
-    This function helps create an operator for the LSTM (Long Short Term
-    Memory) cell that can be used inside an RNN.
-    """
-    helper = LayerHelper('lstm_unit', **locals())
-    rnn = fluid.layers.StaticRNN()
-    with rnn.step():
-        c_pre = rnn.memory(init=c_pre_init)
-        x_t = rnn.step_input(x)
-
-        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
-        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)
-
-        dtype = x.dtype
-        c = helper.create_tmp_variable(dtype)
-        h = helper.create_tmp_variable(dtype)
-
-        helper.append_op(
-            type='lstm_unit',
-            inputs={"X": after_fc,
-                    "C_prev": c_pre},
-            outputs={"C": c,
-                     "H": h},
-            attrs={"forget_bias": forget_bias})
-
-        rnn.update_memory(c_pre, c)
-        rnn.output(h)
-
-    return rnn()
-
-
-def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
-    data = fluid.layers.data(
-        name="words",
-        shape=[seq_len * batch_size, 1],
-        append_batch_size=False,
-        dtype="int64",
-        lod_level=1)
-    label = fluid.layers.data(
-        name="label",
-        shape=[batch_size, 1],
-        append_batch_size=False,
-        dtype="int64")
-
-    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
-    emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
-    emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2])
-
-    c_pre_init = fluid.layers.fill_constant(
-        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
-    c_pre_init.stop_gradient = False
-    layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
-    layer_1_out = fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2])
-
-    prediction = fluid.layers.fc(input=layer_1_out,
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-
-    avg_cost = fluid.layers.mean(x=cost)
-    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
-    adam_optimizer.minimize(avg_cost)
-    acc = fluid.layers.accuracy(input=prediction, label=label)
-
-    return avg_cost, acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def chop_data(data, chop_len=80, batch_size=50):
-    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
-
-    return data[:batch_size]
-
-
-def prepare_feed_data(data, place):
-    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
-
-    label = np.array(map(lambda x: x[1], data)).astype("int64")
-    label = label.reshape([len(label), 1])
-    tensor_label = fluid.LoDTensor()
-    tensor_label.set(label, place)
-
-    return tensor_words, tensor_label
-
-
-def main():
-    BATCH_SIZE = 100
-    PASS_NUM = 5
-
-    word_dict = paddle.dataset.imdb.word_dict()
-    print "load word dict successfully"
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
-        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    exe.run(fluid.default_startup_program())
-
-    for pass_id in xrange(PASS_NUM):
-        for data in train_data():
-            chopped_data = chop_data(data)
-            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
-
-            outs = exe.run(fluid.default_main_program(),
-                           feed={"words": tensor_words,
-                                 "label": tensor_label},
-                           fetch_list=[cost, acc])
-            cost_val = np.array(outs[0])
-            acc_val = np.array(outs[1])
-
-            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-            if acc_val > 0.7:
-                exit(0)
-    exit(1)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
index 8cf54846fe5dba2742ce69e34e0788e124a1a85d..766ba9681d1bb816170e0458f540b32511c02933 100644
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -12,76 +12,145 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import unittest
+import os
 
-PASS_NUM = 100
-EMBED_SIZE = 32
-HIDDEN_SIZE = 256
-N = 5
-BATCH_SIZE = 32
-IS_SPARSE = True
-
-word_dict = paddle.dataset.imikolov.build_dict()
-dict_size = len(word_dict)
-
-first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
-next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
-
-embed_first = fluid.layers.embedding(
-    input=first_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_second = fluid.layers.embedding(
-    input=second_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_third = fluid.layers.embedding(
-    input=third_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-embed_forth = fluid.layers.embedding(
-    input=forth_word,
-    size=[dict_size, EMBED_SIZE],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr='shared_w')
-
-concat_embed = fluid.layers.concat(
-    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
-hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
-predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
-avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-sgd_optimizer.minimize(avg_cost)
-
-train_reader = paddle.batch(
-    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(
-    feed_list=[first_word, second_word, third_word, forth_word, next_word],
-    place=place)
-
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    for data in train_reader():
-        avg_cost_np = exe.run(fluid.default_main_program(),
-                              feed=feeder.feed(data),
-                              fetch_list=[avg_cost])
-        if avg_cost_np[0] < 5.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-exit(1)
+
+def main(use_cuda, is_sparse, parallel):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    PASS_NUM = 100
+    EMBED_SIZE = 32
+    HIDDEN_SIZE = 256
+    N = 5
+    BATCH_SIZE = 32
+    IS_SPARSE = is_sparse
+
+    def __network__(words):
+        embed_first = fluid.layers.embedding(
+            input=words[0],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_second = fluid.layers.embedding(
+            input=words[1],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_third = fluid.layers.embedding(
+            input=words[2],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_forth = fluid.layers.embedding(
+            input=words[3],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+
+        concat_embed = fluid.layers.concat(
+            input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+        hidden1 = fluid.layers.fc(input=concat_embed,
+                                  size=HIDDEN_SIZE,
+                                  act='sigmoid')
+        predict_word = fluid.layers.fc(input=hidden1,
+                                       size=dict_size,
+                                       act='softmax')
+        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
+        avg_cost = fluid.layers.mean(x=cost)
+        return avg_cost
+
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+    if not parallel:
+        avg_cost = __network__(
+            [first_word, second_word, third_word, forth_word, next_word])
+    else:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            avg_cost = __network__(
+                map(pd.read_input, [
+                    first_word, second_word, third_word, forth_word, next_word
+                ]))
+            pd.write_output(avg_cost)
+
+        avg_cost = fluid.layers.mean(x=pd())
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(
+        feed_list=[first_word, second_word, third_word, forth_word, next_word],
+        place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        for data in train_reader():
+            avg_cost_np = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+            if avg_cost_np[0] < 5.0:
+                return
+    raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
+
+
+FULL_TEST = os.getenv('FULL_TEST',
+                      '0').lower() in ['true', '1', 't', 'y', 'yes', 'on']
+SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
+
+
+class W2VTest(unittest.TestCase):
+    pass
+
+
+def inject_test_method(use_cuda, is_sparse, parallel):
+    fn_name = "test_{0}_{1}_{2}".format("cuda" if use_cuda else "cpu", "sparse"
+                                        if is_sparse else "dense", "parallel"
+                                        if parallel else "normal")
+
+    def __impl__(*args, **kwargs):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel)
+
+    if use_cuda and is_sparse and parallel:
+        fn = __impl__
+    else:
+        # skip the other test when on CI server
+        fn = unittest.skipUnless(
+            condition=FULL_TEST, reason=SKIP_REASON)(__impl__)
+
+    setattr(W2VTest, fn_name, fn)
+
+
+for use_cuda in (False, True):
+    for is_sparse in (False, True):
+        for parallel in (False, True):
+            inject_test_method(use_cuda, is_sparse, parallel)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
index 107b9567dc4a8539532c2fff40df437cc72cc163..b0c55df9f58834688846c5362113464996eb286a 100644
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64)).astype('float32')
@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 1.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': np.zeros((32, 64)).astype('float32'),
             'Mask': np.zeros((32, 64)).astype('float32')
@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.0, 'is_test': False}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
         self.outputs = {
             'Out': self.inputs['X'],
             'Mask': np.ones((32, 64, 2)).astype('float32')
@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest):
     def setUp(self):
         self.op_type = "dropout"
         self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
-        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
+        self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
         self.outputs = {
             'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
         }
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31749df9baf10215fcd0cca3c1097f00c163ec7
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
@@ -0,0 +1,43 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwisePowOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestElementwisePowOp_scalar(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype('float32'),
+            'Y': np.random.rand(1).astype('float32')
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_label_smooth_op.py b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..19a4df57446c0c83b415909df3e0246bf2716881
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLabelSmoothOp(OpTest):
+    def config(self):
+        self.op_type = "label_smooth"
+        self.epsilon = 0.1
+        batch_size, self.label_dim = 5, 10
+        self.label = np.zeros((batch_size, self.label_dim)).astype("float64")
+        nonzero_index = np.random.randint(self.label_dim, size=(batch_size))
+        self.label[np.arange(batch_size), nonzero_index] = 1
+
+    def setUp(self):
+        self.config()
+        smoothed_label = (1 - self.epsilon
+                          ) * self.label + self.epsilon / self.label_dim
+        self.inputs = {'X': self.label}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': smoothed_label}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
+    def setUp(self):
+        self.config()
+        dist = np.random.random((1, self.label_dim))
+        smoothed_label = (1 - self.epsilon) * self.label + self.epsilon * dist
+        self.inputs = {'X': self.label, 'PriorDist': dist}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': smoothed_label}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..68cf8673cd46677065588f652482cd0df08b3450
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -0,0 +1,252 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+from operator import mul
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.framework import grad_var_name
+
+
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    x.shape = [N, D]
+
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon
+    output = scale.reshape([1, D]) * np.divide(
+        (x - mean.reshape([N, 1])),
+        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
+
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+
+
+def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
+    x_shape = x.shape
+    scale_shape = scale.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+    scale.shape = [1, D]
+
+    # d_bias
+    d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    # d_scale
+    d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
+                     axis=0).reshape([1, D])
+    # dx
+    dx_end = scale * np.sqrt(1.0 / var) * grad_y
+    d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+        [N, 1])  # the second part equals to zero.
+    d_mean = 1.0 / D * d_mean_0
+    d_std = np.sum(
+        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
+            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
+
+    grad_x = dx_end + d_mean + d_std
+
+    grad_y.shape = x_shape
+    x.shape = x_shape
+    scale.shape = scale_shape
+    return grad_x, d_scale, d_bias
+
+
+def get_backward_op(scope, op, no_grad_set):
+    backward_op = core.Operator.backward(op, no_grad_set)
+    for input in backward_op.input_vars():
+        var = scope.var(input)
+        var.get_tensor()
+    for output in backward_op.output_vars():
+        var = scope.var(output)
+        var.get_tensor()
+    return backward_op
+
+
+def create_or_get_tensor(scope, var_name, var, place):
+    tensor = scope.var(var_name).get_tensor()
+    if var is not None:
+        assert isinstance(var, np.ndarray)
+        tensor.set_lod([[]])
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
+    return tensor
+
+
+def set_output_grad(scope, outputs, place, feed_dict=None):
+    def __set_tensor__(name, data=None):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if data is None:
+            if out_dtype == core.DataType.FP64:
+                data = np.ones(out_tensor.shape(), dtype=np.float64)
+            elif out_dtype == core.DataType.FP32:
+                data = np.ones(out_tensor.shape(), dtype=np.float32)
+            else:
+                raise ValueError("Not supported data type " + str(out_dtype))
+        grad_tensor.set(data, place)
+
+    for output in outputs:
+        data = None
+        if output in feed_dict:
+            data = feed_dict[output]
+        __set_tensor__(output, data)
+
+
+class TestLayerNormdOp(OpTest):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(
+            np.allclose(
+                np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
+            msg)
+
+    def __assert_grad_close(self,
+                            tensor,
+                            np_array,
+                            name,
+                            place,
+                            max_relative_error=0.02):
+        a = np.array(tensor).reshape(np_array.shape)
+        b = np_array
+        abs_a = np.abs(a)
+        abs_a[abs_a < 1e-5] = 1
+
+        diff_mat = np.abs(a - b) / abs_a
+        max_diff = np.max(diff_mat)
+
+        def err_msg():
+            offset = np.argmax(diff_mat > max_relative_error)
+            return ("%s Variable %s max gradient diff %f over limit %f, "
+                    "the first error element is %d, %f, %f") % (
+                        "Gradient Check On %s" % str(place), name, max_diff,
+                        max_relative_error, offset, a.flatten()[offset],
+                        b.flatten()[offset])
+
+        self.assertLessEqual(max_diff, max_relative_error, err_msg())
+
+    def check_forward_backward(self, shape, begin_norm_axis):
+        def test_with_place(place, shape, begin_norm_axis=1):
+            # setUp
+            assert begin_norm_axis > 0 and begin_norm_axis < len(
+                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+            # attr
+            epsilon = 0.00001
+            x_shape = shape
+            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+            scale_shape = [D]
+            np.random.random(123)
+            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            y_grad = np.random.random_sample(x_shape).astype(np.float32)
+
+            # run forward
+            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
+                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
+
+            # get gradient
+            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
+                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            naive_grad = {
+                "X": x_grad_ref,
+                "Scale": scale_grad_ref,
+                "Bias": bias_grad_ref
+            }
+
+            scope = core.Scope()
+
+            # create input
+            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+            for i_name in input_map:
+                create_or_get_tensor(scope, i_name, input_map[i_name], place)
+
+            # create output
+            output_map = {"Y": None, "Mean": None, "Variance": None}
+            output_tensor = {}
+            for o_name in output_map:
+                output_tensor[o_name] = create_or_get_tensor(
+                    scope, o_name, output_map[o_name], place)
+
+            layer_norm_op = Operator(
+                "layer_norm",
+                # inputs
+                X="X",
+                Scale="Scale",
+                Bias="Bias",
+                # outputs
+                Y="Y",
+                Mean="Mean",
+                Variance="Variance",
+                # attrs
+                epsilon=epsilon,
+                begin_norm_axis=begin_norm_axis)
+
+            layer_norm_op.run(scope, place)
+
+            # check forward result
+            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+            for o_tensor in output_tensor:
+                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                                    o_tensor, atol)
+
+            # run backward
+            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
+            set_output_grad(
+                scope, ["Y", "Mean", "Variance"],
+                place,
+                feed_dict={"Y": y_grad})
+            layer_norm_op_grad.run(scope, place)
+
+            # get output
+            grad_tensor = {}
+            for o_name in naive_grad:
+                grad_tensor[o_name] = x_ = create_or_get_tensor(
+                    scope, grad_var_name(o_name), None, place)
+
+            # check gradient output
+            for o_grad in naive_grad:
+                self.__assert_grad_close(grad_tensor[o_grad],
+                                         naive_grad[o_grad], o_grad + "@GRAD",
+                                         place)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place, shape, begin_norm_axis)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
+
+    def test_check_forward_backward_with_scale(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward_with_bias(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward(self):
+        pass  # TODO(zcd)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc348cf2d21693290095900f8ab63c29923b4673
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import math
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.learning_rate_decay as lr_decay
+
+
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    exponent = float(global_step) / float(decay_steps)
+    if staircase:
+        exponent = math.floor(exponent)
+    return learning_rate * decay_rate**exponent
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    exponent = float(global_step) / float(decay_steps)
+    if staircase:
+        exponent = math.floor(exponent)
+    return learning_rate * math.exp(-1 * decay_rate * exponent)
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    temp = float(global_step) / float(decay_steps)
+    if staircase:
+        temp = math.floor(temp)
+    return learning_rate / (1 + decay_rate * temp)
+
+
+class TestLearningRateDecay(unittest.TestCase):
+    def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
+        init_lr = 1.0
+        decay_steps = 5
+        decay_rate = 0.5
+
+        global_step = layers.create_global_var(
+            shape=[1], value=0.0, dtype='float32', persistable=True)
+
+        decayed_lr = fluid_decay_fn(
+            learning_rate=init_lr,
+            global_step=global_step,
+            decay_steps=decay_steps,
+            decay_rate=decay_rate,
+            staircase=staircase)
+        layers.increment(global_step, 1.0)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        exe.run(fluid.default_startup_program())
+        for step in range(10):
+            step_val, lr_val = exe.run(fluid.default_main_program(),
+                                       feed=[],
+                                       fetch_list=[global_step, decayed_lr])
+            python_decayed_lr = python_decay_fn(
+                learning_rate=init_lr,
+                global_step=step,
+                decay_steps=decay_steps,
+                decay_rate=decay_rate,
+                staircase=staircase)
+            self.assertAlmostEqual(python_decayed_lr, lr_val[0])
+
+    def test_decay(self):
+        decay_fns = [
+            (exponential_decay, lr_decay.exponential_decay, True),
+            (exponential_decay, lr_decay.exponential_decay, False),
+            (natural_exp_decay, lr_decay.natural_exp_decay, True),
+            (natural_exp_decay, lr_decay.natural_exp_decay, False),
+            (inverse_time_decay, lr_decay.inverse_time_decay, True),
+            (inverse_time_decay, lr_decay.inverse_time_decay, False),
+        ]
+
+        for py_decay_fn, fluid_decay_fn, staircase in decay_fns:
+            print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
+                staircase))
+            main_program = framework.Program()
+            startup_program = framework.Program()
+            with framework.program_guard(main_program, startup_program):
+                self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py
index d5cc235f588ad37b0d1293dc9894952c97411757..0219bef42b3ba133dda7412c1036cf989a170a36 100644
--- a/python/paddle/v2/fluid/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase):
         scope = core.Scope()
         place = core.CPUPlace()
         lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor = core.LoDTensor(lod_py)
+        lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.set_lod(lod_py)
+        lod_tensor.alloc_float(place)
+        tensor_array = numpy.array(lod_tensor)
+        tensor_array[0, 0, 0, 0] = 1.0
+        tensor_array[0, 0, 0, 1] = 2.0
+        lod_tensor.set(tensor_array, place)
+
+        lod_v = numpy.array(lod_tensor)
+        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
+        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
+        self.assertListEqual(lod_py, lod_tensor.lod())
+
+    def test_lod_tensor_gpu_init(self):
+        if not core.is_compiled_with_cuda():
+            return
+        scope = core.Scope()
+        place = core.CUDAPlace(0)
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_tensor = core.LoDTensor()
+
+        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.set_lod(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
diff --git a/python/paddle/v2/fluid/tests/test_weight_normalization.py b/python/paddle/v2/fluid/tests/test_weight_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ad8285d8a3c2ced814cc3588a814c14ec60855
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_weight_normalization.py
@@ -0,0 +1,121 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+import collections
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.initializer import ConstantInitializer
+from paddle.v2.fluid.param_attr import WeightNormParamAttr
+
+
+class TestWeightNormalization(unittest.TestCase):
+    batch_size = 3
+    hidden_size = 5
+    data_desc = (['x', [10], 0], )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.set_program()
+
+    @classmethod
+    def set_program(cls):
+        data = fluid.layers.data(
+            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
+        out = fluid.layers.fc(input=data,
+                              size=cls.hidden_size,
+                              param_attr=WeightNormParamAttr(
+                                  dim=None,
+                                  name='weight_norm_param',
+                                  initializer=ConstantInitializer(1.0)),
+                              bias_attr=False,
+                              act=None)
+        loss = fluid.layers.reduce_sum(out)
+        fluid.backward.append_backward(loss=loss)
+        cls.fetch_list = [
+            'weight_norm_param_g', 'weight_norm_param_v',
+            'weight_norm_param_g@GRAD'
+        ]
+
+    def run_program(self):
+        outputs = []
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=False)
+            outputs.append(output)
+        self.actual_outputs = outputs
+
+    def set_data(self):
+        self.data = collections.OrderedDict()
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_shape = desc[1]
+            data_lod_level = desc[2]
+            data_lod = []
+            for i in range(data_lod_level):
+                lod_level_i = numpy.random.randint(
+                    low=1,
+                    high=5,
+                    size=self.batch_size if i == 0 else lod_level_i[-1])
+                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                data_lod.append(lod_level_i)
+            data_value = numpy.random.random(
+                size=[data_lod[-1][-1] if data_lod else self.batch_size
+                      ] + data_shape).astype('float32')
+            self.data[data_name] = (data_value, data_lod)
+
+    def set_inputs(self, place):
+        self.inputs = {}
+        for desc in self.data_desc:
+            tensor = fluid.Tensor()
+            tensor.set(self.data[desc[0]][0], place)
+            if self.data[desc[0]][1]:
+                tensor.set_lod(self.data[desc[0]][1])
+            self.inputs[desc[0]] = tensor
+
+    def weight_normalize(self):
+        v = numpy.ones((self.data[self.data_desc[0][0]][0].shape[-1],
+                        self.hidden_size))
+        g = numpy.linalg.norm(v, axis=None, keepdims=True)
+        w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)
+        x = self.data[self.data_desc[0][0]][0]
+        out = numpy.dot(x, w)
+        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) * (v / numpy.linalg.norm(
+            v, axis=None, keepdims=True))).sum(axis=None, keepdims=True)
+        return g, v, g_grad
+
+    def test_weight_normalization(self):
+        self.set_data()
+        self.run_program()
+        expect_output = self.weight_normalize()
+        for actual_output in self.actual_outputs:
+            [
+                self.assertTrue(
+                    numpy.allclose(
+                        numpy.array(actual), expect, atol=0.001))
+                for expect, actual in zip(expect_output, actual_output)
+            ]
+
+
+if __name__ == '__main__':
+    unittest.main()