diff --git a/AUTHORS.md b/AUTHORS.md
index 11f227be7148d8d6e055538347a8c31679406c84..8c4a113fc276783c945867ceae9612339b7f0bbc 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -22,6 +22,7 @@
 | jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
+| kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |
diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py
index 69541adf6b7e53fcc1ac9d3c82b5a60ca0a72879..17f6b03826ae818a3671ea7f9355a8e8c04b50be 100644
--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
         return avg_cost, feeding_list
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    lod_t = core.LoDTensor()
-    lod_t.set(flattened_data, place)
-    lod_t.set_lod([lod])
-    return lod_t, lod[-1]
-
-
 def lodtensor_to_ndarray(lod_tensor):
     dims = lod_tensor.get_dims()
     ndarray = np.zeros(shape=dims).astype('float32')
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
index 211869af4e8d7180cb485811d3363c50d32f0f74..3231542a17ace99a17c9f9b9bdb3c2527637d9ef 100644
--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -125,18 +125,3 @@ def get_model(args):
         batch_size=args.batch_size)
 
     return loss, inference_program, adam, train_reader, test_reader, batch_acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 236a55d332a91c88d1c5515e7aca4142930a079f..cd44fe2542bfa8c53721d61b70778226e640d375 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -39,7 +39,7 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
-    
+
     add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
@@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
+if(WITH_CONTRIB)
+   set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+   copy(contrib_inference_lib DEPS paddle_inference_api
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
+        DSTS ${contrib_dst_dir} ${contrib_dst_dir}
+   )
+endif()
+
 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
index 27f2419c06b3ba2d29c471c4928d098ccee9ea02..acc8b4aa3fb258e5beef2d1e54919d429cf7ea6f 100755
--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst
 
 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
index c49a98c744cdf907630ea8c74791ff2021d996e8..57efc9823ca0300018b4704e2e32105176970e6b 100644
--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,6 +33,13 @@ Xavier
     :members:
     :noindex:
 
+Bilinear
+--------
+
+..  autoclass:: paddle.fluid.initializer.Bilinear
+    :members:
+    :noindex:
+
 force_init_on_cpu
 -----------------
 
@@ -73,3 +80,10 @@ XavierInitializer
     :members:
     :noindex:
 
+BilinearInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.BilinearInitializer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
index dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61..21334c9edaada4398ec53455e31625d29f67dc54 100644
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,39 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
     :noindex:
 
+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
+load_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+    :noindex:
+
+save_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+    :noindex:
+
+get_latest_checkpoint_serial
+----------------------------
+
+..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+    :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index e5ced9c04c3f702733635ad0397c8c52ec4b3970..1f8f6360404f96b328e9018704bd970165b5e42c 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,6 +181,12 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
     :noindex:
 
+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
 device
 ======
 
@@ -219,6 +225,12 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
     :noindex:
 
+Recv
+----
+
+..  autofunction:: paddle.fluid.layers.Recv
+    :noindex:
+
 open_recordio_file
 ------------------
 
@@ -255,6 +267,25 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
     :noindex:
 
+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
+load
+----
+
+..  autofunction:: paddle.fluid.layers.load
+    :noindex:
+
 nn
 ==
 
@@ -342,6 +373,12 @@ conv2d
 ..  autofunction:: paddle.fluid.layers.conv2d
     :noindex:
 
+conv3d
+------
+
+..  autofunction:: paddle.fluid.layers.conv3d
+    :noindex:
+
 sequence_pool
 -------------
 
@@ -366,6 +403,12 @@ pool2d
 ..  autofunction:: paddle.fluid.layers.pool2d
     :noindex:
 
+pool3d
+------
+
+..  autofunction:: paddle.fluid.layers.pool3d
+    :noindex:
+
 batch_norm
 ----------
 
@@ -384,6 +427,12 @@ conv2d_transpose
 ..  autofunction:: paddle.fluid.layers.conv2d_transpose
     :noindex:
 
+conv3d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv3d_transpose
+    :noindex:
+
 sequence_expand
 ---------------
 
@@ -594,6 +643,48 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
     :noindex:
 
+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+image_resize
+------------
+
+..  autofunction:: paddle.fluid.layers.image_resize
+    :noindex:
+
+image_resize_short
+------------------
+
+..  autofunction:: paddle.fluid.layers.image_resize_short
+    :noindex:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
+mean_iou
+--------
+
+..  autofunction:: paddle.fluid.layers.mean_iou
+    :noindex:
+
 ops
 ===
 
@@ -699,12 +790,6 @@ logical_not
 ..  autofunction:: paddle.fluid.layers.logical_not
     :noindex:
 
-uniform_random
---------------
-
-..  autofunction:: paddle.fluid.layers.uniform_random
-    :noindex:
-
 uniform_random_batch_size_like
 ------------------------------
 
@@ -723,12 +808,6 @@ gaussian_random_batch_size_like
 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
     :noindex:
 
-cumsum
-------
-
-..  autofunction:: paddle.fluid.layers.cumsum
-    :noindex:
-
 scatter
 -------
 
@@ -741,6 +820,30 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
     :noindex:
 
+slice
+-----
+
+..  autofunction:: paddle.fluid.layers.slice
+    :noindex:
+
+polygon_box_transform
+---------------------
+
+..  autofunction:: paddle.fluid.layers.polygon_box_transform
+    :noindex:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
+    :noindex:
+
+maxout
+------
+
+..  autofunction:: paddle.fluid.layers.maxout
+    :noindex:
+
 sigmoid
 -------
 
@@ -897,18 +1000,6 @@ stanh
 ..  autofunction:: paddle.fluid.layers.stanh
     :noindex:
 
-hard_shrink
------------
-
-..  autofunction:: paddle.fluid.layers.hard_shrink
-    :noindex:
-
-thresholded_relu
-----------------
-
-..  autofunction:: paddle.fluid.layers.thresholded_relu
-    :noindex:
-
 hard_sigmoid
 ------------
 
@@ -921,6 +1012,30 @@ swish
 ..  autofunction:: paddle.fluid.layers.swish
     :noindex:
 
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+hard_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.hard_shrink
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+thresholded_relu
+----------------
+
+..  autofunction:: paddle.fluid.layers.thresholded_relu
+    :noindex:
+
 tensor
 ======
 
@@ -978,6 +1093,18 @@ fill_constant
 ..  autofunction:: paddle.fluid.layers.fill_constant
     :noindex:
 
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
 ones
 ----
 
@@ -993,6 +1120,12 @@ zeros
 detection
 =========
 
+prior_box
+---------
+
+..  autofunction:: paddle.fluid.layers.prior_box
+    :noindex:
+
 multi_box_head
 --------------
 
@@ -1080,3 +1213,18 @@ noam_decay
 ..  autofunction:: paddle.fluid.layers.noam_decay
     :noindex:
 
+metric
+======
+
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
+    :noindex:
+
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
index 79a0995fce303518d989693976c4e92e05795ca2..6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005 100644
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,6 +89,13 @@ DecayedAdagradOptimizer
     :members:
     :noindex:
 
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
 Adadelta
 --------
 
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
index 74d102dcb0db35766c34e3d14939a8aa5861686b..39fda65863471a78895503184848a754828b71a1 100644
--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
     :noindex:
 
+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+
diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
index add06e42f1bbd221b48eb83e4e84d4a7c89e7483..3244eedf918b93f9351258f1218dfb2d507c1a9c 100644
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 
-关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
 
 ## 提交（commit）
 
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
index 70e3a0583d8ecf9db19a85c0978aae0ce0625570..4b19256ef4533a09162edf907f6cd51146517e46 100644
--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,4 +14,3 @@
 #
 
 add_subdirectory(inference)
-add_subdirectory(tape)
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 277b0b175b29f682eed5a6584867ffa239d9d081..0f56d648b1939e1d6af3368bb2423477a3b638fc 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -50,7 +50,7 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                     ARGS test_word2vec test_image_classification)
 
-if (WITH_ANAKIN)
+if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
     # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
     # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
     # compile the libinference_anakin_api.a and compile with anakin.so.
diff --git a/paddle/contrib/tape/CMakeLists.txt b/paddle/contrib/tape/CMakeLists.txt
deleted file mode 100644
index 0acef17d6a2cd69d334ce57dc388a5a8d67e1936..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
-endif(APPLE)
-
-cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
-cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
-
-cc_test(test_tape
-        SRCS test_tape.cc
-        DEPS tape tape_variable)
diff --git a/paddle/contrib/tape/README.md b/paddle/contrib/tape/README.md
deleted file mode 100644
index 16c22a45d59664e44c83923371c0f0d957a8ca7f..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/README.md
+++ /dev/null
@@ -1,252 +0,0 @@
-# Dynamic Graph on Fluid
-
-PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very
-challenging and we are still way from there. DyNet and PyTorch provide a good design
-idea, the *tape*, that significantly eases the challenge.  Also, DyNet provides
-a C++ API that is as convenient as Python but with higher efficiency and could
-conveniently integrate with industrial/production systems. This package, `tape`,
-combines the good of
-
-1. tape from PyTorch and DyNet
-2. C++ API and core from DyNet
-3. rich set of operators from PaddlePaddle
-
-## Overview
-
-We can implement Dynet-like Tape(See this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
-by wrapping Paddle Fluid's `Operator` and `Variable`.
-
-The user API is straight forward since
-
-1. it is imperative. And it uses host language's control flow logic.
-1. it avoids extra concepts such as `Scope` and `Executor`.
-
-All of these benefits come at the cost of just adding one line `reset_global_tape`
-at every iteration.
-
-## Code Structure
-
-In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
-`type`, the pointers to the `Variable`s, and necessary attributes.
-
-```c++
-class Variable {
-public:
-  VriableHandle Grad(); // returns its gradient variable
-private:
-  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
-  framework::Variable var_; // run time variable, holds data memory
-};
-
-using VariableHandle = shared_ptr<Variable>;
-
-struct OpHandle {
-  string type_;
-  map<string, vector<VariableHandle>> inputs_;
-  map<string, vector<VariableHandle>> outputs_;
-  AttributeMap attrs_;
-};
-
-class Tape {
-public:
-  void AddOp(OpHandle); // add op
-  void Forward();       // execute the tape_
-  void Backward();      // execute the backward of the tape_
-private:
-  vector<OpHandle> tape_;
-};
-```
-
-We uses `Function` to indicate layers. It takes care of parameter
-initialization and `AddOp` to the Tape when it is called.
-
-```c++
-class Linear {
- public:
-  Linear(int in_dim, int out_dim, const std::string &act)
-      : w_(new Variable("LinearWeight")),
-        b_(new Variable("LinearBias")),
-        act_(act) {
-    Tape init_tape;
-
-    std::string initializer = "fill_constant";
-    framework::AttributeMap attrs;
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{in_dim, out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
-
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
-
-    init_tape.Forward();
-  }
-
-  VariableHandle operator()(VariableHandle input) {
-    VariableHandle pre_bias(new Variable("linear"));
-    get_global_tape().AddOp("mul",
-                            {{"X", {input}}, {"Y", {w_}}},
-                            {{"Out", {pre_bias}}},
-                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
-    VariableHandle pre_act(new Variable("linear"));
-    get_global_tape().AddOp("elementwise_add",
-                            {{"X", {pre_bias}}, {"Y", {b_}}},
-                            {{"Out", {pre_act}}},
-                            {{"axis", 1}});
-    VariableHandle post_act(new Variable("linear"));
-    get_global_tape().AddOp(act_,
-                            {{"X", {pre_act}}},
-                            {{"Out", {post_act}}},
-                            {});
-    return post_act;
-  }
-
-  std::vector<VariableHandle> Params() { return {w_, b_}; }
-
- private:
-  VariableHandle w_;
-  VariableHandle b_;
-  std::string act_;
-};
-```
-
-## User API
-
-```c++
-// Model function
-paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
-paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
-paddle::tape::Mean mean;
-
-// Optimizer
-paddle::tape::SGD sgd(0.001);
-
-// Data Feeder
-paddle::tape::Fill data_feeder(...);
-VariableHandle input(new paddle::tape::Variable("input"));
-VariableHandle label(new paddle::tape::Variable("label"));
-
-for (int i = 0; i < 2; ++i) {
-  reset_global_tape();
-
-  data_feeder(input, label);
-
-  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
-  LOG(INFO) << loss.value(); // Run forward up to loss
-
-  // Run backward, store gradient of w at w->Grad()
-  get_global_tape.Backward(loss);
-
-  // Update w
-  sgd(linear1.Params());
-  sgd(linear2.Params());
-}
-```
-
-<details>
-  <summary></summary>
-digraph G {
-
-	subgraph cluster_0 {
-                node [shape=record,style=filled];
-		style=filled;
-		color=lightgrey;
-                linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} |  {output |<before_bias1> Out: before_bias1}}"];
-                elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} |  {output |<before_act1> Out: before_act1}}"];
-                relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} |  {output |<after_act1> Out: after_act1}}"];
-
-		linear1 -> elementwise_add1->relu1;
-		label = "forward tape";
-	}
-
-        linear1:before_mul1->before_mul1
-        linear1:weight1->weight1
-        linear1:before_bias1->before_bias1
-
-        elementwise_add1:bias1->bias1
-        elementwise_add1:before_bias1->before_bias1
-        elementwise_add1:before_act1->before_act1
-
-        relu1:before_act1->before_act1
-        relu1:after_act1->after_act1
-
-	subgraph cluster_1 {
-                node [shape=record,style=filled];
-		style=filled;
-		color=lightgrey;
-                linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} |  {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
-
-                elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} |  {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
-
-                relu1_grad [label="{type: relu_grad |  {input |<after_act1_grad> Out_grad: after_act1_grad} | {ouput | {<before_act1_grad>X_grad: before_act1_grad }}}"];
-
-		linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
-                label = "backward tape";
-	}
-
-        relu1_grad:after_act1_grad->after_act1_grad
-        relu1_grad:before_act1_grad->before_act1_grad
-
-        elementwise_add1_grad:before_act1_grad->before_act1_grad
-        elementwise_add1_grad:before_bias1_grad->before_bias1_grad
-        elementwise_add1_grad:bias1_grad->bias1_grad
-
-        linear1_grad:before_mul1->before_mul1
-        linear1_grad:weight1->weight1
-        linear1_grad:before_bias1_grad->before_bias1_grad
-        linear1_grad:before_mul1_grad->before_mul1_grad
-        linear1_grad:weight1_grad->weight1_grad
-
-
-	subgraph cluster_2 {
-                node [shape=record];
-                label = "Linear1";
-                weight1
-                bias1
-	}
-
-        weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
-        bias1 -> bias1_grad [ label="Grad()", style="dashed"];
-
-	
-
-}
-</details>
-
-![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
-
-## Code Reuse
-
-We want to stay close to Paddle Fluid as much as possible.
-
-### Reuse All Operators
-
-As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
-is about 10 lines of code, similar to expose an operator to Python.
-
-### Reuse Compile Time InferShape and InferVarType
-
-Note that all the symbolic information is stored at `tape::Varaible::desc_`, instead
-of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
-`InferVarType` every time we `AddOp` to the tape.
-
-### Reuse Operator::Run
-
-We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary
-`Scope` for every `Operator::Run()`.
-
-## Possible Feature
-
-### Release Memory on Backward
-
-We can release memory aggressively. During backward, we can delete the OpHandle once
-we have finished its backward. Since all the variable is managed by smart pointer, the
-memory is automatically released when its `ref_count` goes to 0.
-
-### Kernel Fusion
-
-As a symbolic representation of the Tape is constructed first before the actual
-execution, it would be possible to perform graph optimization. One use case is kernel
-fusion.
diff --git a/paddle/contrib/tape/computation_graph.png b/paddle/contrib/tape/computation_graph.png
deleted file mode 100644
index 6cf5ead735d5d18b204b079771e53d44483cf016..0000000000000000000000000000000000000000
Binary files a/paddle/contrib/tape/computation_graph.png and /dev/null differ
diff --git a/paddle/contrib/tape/function.h b/paddle/contrib/tape/function.h
deleted file mode 100644
index 8c9694d9a21b5948361164eab60a663ec4fd3803..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/function.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/contrib/tape/tape.h"
-#include "paddle/contrib/tape/variable.h"
-#include "paddle/fluid/framework/type_defs.h"
-
-namespace paddle {
-namespace tape {
-
-class Function {};
-
-class Fill {
- public:
-  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
-      : initializer_(initializer), attrs_(attrs) {}
-
-  void operator()(VariableHandle var) {
-    get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
-  }
-
- private:
-  const std::string initializer_;
-  const framework::AttributeMap attrs_;
-};
-
-class Mean {
- public:
-  VariableHandle operator()(VariableHandle var) {
-    VariableHandle out(new Variable("mean"));
-    get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {});
-    return out;
-  }
-};
-
-class Linear {
- public:
-  Linear(int in_dim, int out_dim, const std::string &act)
-      : w_(new Variable("LinearWeight")),
-        b_(new Variable("LinearBias")),
-        act_(act) {
-    Tape init_tape;
-
-    std::string initializer = "fill_constant";
-    framework::AttributeMap attrs;
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{in_dim, out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
-
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
-
-    init_tape.Forward();
-  }
-
-  VariableHandle operator()(VariableHandle input) {
-    VariableHandle pre_bias(new Variable("linear"));
-    get_global_tape().AddOp("mul",
-                            {{"X", {input}}, {"Y", {w_}}},
-                            {{"Out", {pre_bias}}},
-                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
-    VariableHandle pre_act(new Variable("linear"));
-    get_global_tape().AddOp("elementwise_add",
-                            {{"X", {pre_bias}}, {"Y", {b_}}},
-                            {{"Out", {pre_act}}},
-                            {{"axis", 1}});
-    VariableHandle post_act(new Variable("linear"));
-    get_global_tape().AddOp(
-        act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
-    return post_act;
-  }
-
-  std::vector<VariableHandle> Params() { return {w_, b_}; }
-
- private:
-  VariableHandle w_;
-  VariableHandle b_;
-  std::string act_;
-};
-
-class SGD {
- public:
-  SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
-    Tape init_tape;
-
-    std::string initializer = "fill_constant";
-    framework::AttributeMap attrs;
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{1};
-    attrs["value"] = learning_rate;
-    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
-
-    init_tape.Forward();
-  }
-
-  void operator()(VariableHandle input) {
-    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
-                   "optimization must happen after the backward");
-    Tape temp_tape;
-    temp_tape.AddOp("sgd",
-                    {{"Param", {input}},
-                     {"LearningRate", {learning_rate_}},
-                     {"Grad", {input->Grad()}}},
-                    {{"ParamOut", {input}}},
-                    {});
-    temp_tape.Forward();
-  }
-
- private:
-  VariableHandle learning_rate_;
-};
-}
-}
diff --git a/paddle/contrib/tape/tape.cc b/paddle/contrib/tape/tape.cc
deleted file mode 100644
index 531499b6fe02abf200b7d4401494fd6350646622..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/tape.cc
+++ /dev/null
@@ -1,265 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/contrib/tape/tape.h"
-
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/pybind/pybind.h"
-
-namespace paddle {
-namespace tape {
-
-// borrowed from
-// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
-inline bool ends_with(std::string const &value, std::string const &ending) {
-  if (ending.size() > value.size()) return false;
-  return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-}
-
-std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
-  os << var_desc.Name();
-  os << "[" << var_desc.GetType() << "]";
-  os << "[" << var_desc.GetDataType() << "]";
-  os << "{";
-  for (auto &i : var_desc.GetShape()) {
-    os << i << ",";
-  }
-  os << "}";
-  return os;
-}
-
-std::string to_string(const std::string &type,
-                      const VariableHandleMap &in_vars,
-                      const VariableHandleMap &out_vars,
-                      const framework::AttributeMap &attrs) {
-  std::stringstream ss;
-  ss << type << " ";
-  for (auto &param_name : in_vars) {
-    for (auto &var : param_name.second) {
-      ss << param_name.first << ":(" << var->Desc() << ") ";
-    }
-  }
-  for (auto &param_name : out_vars) {
-    for (auto &var : param_name.second) {
-      ss << param_name.first << ":(" << var->Desc() << ") ";
-    }
-  }
-  return ss.str();
-}
-
-framework::OpDesc CreateOpDesc(const std::string &type,
-                               const VariableHandleMap &in_vars,
-                               const VariableHandleMap &out_vars,
-                               const framework::AttributeMap &attrs) {
-  framework::VariableNameMap inputs;
-  for (auto &param_name : in_vars) {
-    for (auto &var : param_name.second) {
-      inputs[param_name.first].emplace_back(var->Name());
-    }
-  }
-  framework::VariableNameMap outputs;
-  for (auto &param_name : out_vars) {
-    for (auto &var : param_name.second) {
-      outputs[param_name.first].emplace_back(var->Name());
-    }
-  }
-  return framework::OpDesc(type, inputs, outputs, attrs);
-}
-
-void InferShapeAndVarType(const std::string &type,
-                          const VariableHandleMap &in_vars,
-                          VariableHandleMap *out_vars,
-                          const framework::AttributeMap &attrs) {
-  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
-
-  // Create a temporary block for compile-time
-  framework::ProgramDesc program_desc;
-  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
-  PADDLE_ENFORCE(block_desc);
-
-  for (auto &param_name : in_vars) {
-    for (auto &var : param_name.second) {
-      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
-    }
-  }
-  for (auto &param_name : *out_vars) {
-    for (auto &var : param_name.second) {
-      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
-    }
-  }
-
-  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
-  op_desc.InferShape(*block_desc);
-  op_desc.InferVarType(block_desc);
-  for (auto &param_name : *out_vars) {
-    for (auto &var : param_name.second) {
-      *var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
-    }
-  }
-  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
-}
-
-void Tape::AddOp(const std::string &type,
-                 const VariableHandleMap &in_vars,
-                 VariableHandleMap out_vars,
-                 const framework::AttributeMap &attrs) {
-  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
-  tape_.emplace_back(type, in_vars, out_vars, attrs);
-}
-
-// Temporary Scope for Operator::Run()
-class ScopeWrapper : public framework::Scope {
- public:
-  ScopeWrapper(const VariableHandleMap &in_vars,
-               const VariableHandleMap &out_vars) {
-    for (auto &v : in_vars) {
-      for (auto &vv : v.second) {
-        if (!vars_.count(vv->Name())) {
-          vars_[vv->Name()].reset(vv->Var());
-        }
-      }
-    }
-    for (auto &v : out_vars) {
-      for (auto &vv : v.second) {
-        if (!vars_.count(vv->Name())) {
-          vars_[vv->Name()].reset(vv->Var());
-        }
-      }
-    }
-  }
-
-  ~ScopeWrapper() {
-    for (auto &pair : vars_) {
-      pair.second.release();
-    }
-  }
-};
-
-void Tape::Forward() {
-  LOG(INFO) << "Starting forward -------------------------";
-  PADDLE_ENFORCE(!has_been_backwarded_);
-  while (current_position_ < tape_.size()) {
-    OpHandle &op = tape_[current_position_];
-
-    // Create Output Tensor, this is only necessary for OpWithKernel
-    for (auto &param2var : op.outputs_) {
-      for (auto &var : param2var.second) {
-        var->InitializeVariable();
-      }
-    }
-
-    framework::OpDesc op_desc =
-        CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
-    ScopeWrapper scope(op.inputs_, op.outputs_);
-    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
-    current_position_++;
-  }
-
-  LOG(INFO) << "Finishing forward -------------------------";
-}
-
-void Tape::Backward(VariableHandle target) {
-  PADDLE_ENFORCE(!has_been_backwarded_);
-
-  Forward();
-
-  // TODO(tonyyang-svail): check output of last op is target
-  backward_tape_.reset(new Tape());
-
-  framework::AttributeMap attrs;
-
-  // FIXME(tonyyang-svail): Need to infer_data_type
-  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
-  attrs["shape"] = std::vector<int>{1};
-  attrs["value"] = 1.0f;
-  backward_tape_->AddOp(
-      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
-
-  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
-    framework::OpDesc op_desc =
-        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
-    std::unordered_map<std::string, std::string> grad_to_var;
-    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
-        framework::OpInfoMap::Instance()
-            .Get(op_desc.Type())
-            .GradOpMaker()(op_desc, {}, &grad_to_var, {});
-
-    for (auto &op_desc : grad_op_descs) {
-      std::unordered_map<std::string, VariableHandle> name2var;
-      for (auto &param2vars : it->inputs_) {
-        for (auto &a : param2vars.second) {
-          name2var[a->Name()] = a;
-        }
-      }
-      for (auto &param2vars : it->outputs_) {
-        for (auto &a : param2vars.second) {
-          name2var[a->Name()] = a;
-        }
-      }
-
-      VariableHandleMap in_vars;
-      VariableHandleMap out_vars;
-      std::map<const framework::VariableNameMap *, VariableHandleMap *>
-          loop_over{{&op_desc->Inputs(), &in_vars},
-                    {&op_desc->Outputs(), &out_vars}};
-      for (auto &each : loop_over) {
-        auto &vmp = *each.first;
-        auto &vhm = *each.second;
-        for (auto &p2a : vmp) {
-          for (auto &argu : p2a.second) {
-            if (name2var.count(argu)) {
-              vhm[p2a.first].push_back(name2var[argu]);
-            } else {
-              PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
-                             argu.c_str());
-              std::string name = argu.substr(
-                  0, argu.size() - std::strlen(framework::kGradVarSuffix));
-              PADDLE_ENFORCE(name2var.count(name), name.c_str());
-              vhm[p2a.first].push_back(name2var[name]->Grad());
-            }
-          }
-        }
-      }
-
-      backward_tape_->AddOp(
-          op_desc->Type(), in_vars, out_vars, op_desc->GetAttrMap());
-    }
-
-    // TODO(tonyyang-svail): how to fill empty grad?
-    // TODO(tonyyang-svail): Sum var grad is necessary
-  }
-
-  backward_tape_->Forward();
-  has_been_backwarded_ = true;
-}
-
-Tape &get_global_tape() {
-  static Tape T;
-  return T;
-}
-
-void reset_global_tape() { get_global_tape() = Tape(); }
-}
-}
diff --git a/paddle/contrib/tape/tape.h b/paddle/contrib/tape/tape.h
deleted file mode 100644
index ed79de17a7fca58a2c542831560f0dd5ad34f960..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/tape.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/contrib/tape/variable.h"
-
-namespace paddle {
-namespace tape {
-
-using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;
-
-struct OpHandle {
-  OpHandle(const std::string &type,
-           const VariableHandleMap &in_vars,
-           const VariableHandleMap &out_vars,
-           const framework::AttributeMap &attrs)
-      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
-
-  std::string type_;
-  VariableHandleMap inputs_;
-  VariableHandleMap outputs_;
-  framework::AttributeMap attrs_;
-};
-
-class Tape {
- public:
-  void AddOp(const std::string &type,
-             const VariableHandleMap &in_vars,
-             VariableHandleMap out_vars,
-             const framework::AttributeMap &attrs);
-  void Forward();
-  void Backward(VariableHandle target);
-
-  bool HasBeenBackwarded() { return has_been_backwarded_; }
-
- private:
-  bool has_been_backwarded_ = false;
-  size_t current_position_ = 0;
-
-  std::vector<OpHandle> tape_;
-  std::shared_ptr<Tape> backward_tape_;
-};
-
-Tape &get_global_tape();
-
-void reset_global_tape();
-}
-}
diff --git a/paddle/contrib/tape/test_tape.cc b/paddle/contrib/tape/test_tape.cc
deleted file mode 100644
index e9bfd21a7189c5867a52d2b25db09a462d5c7ba7..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/test_tape.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gtest/gtest.h"
-#include "paddle/contrib/tape/function.h"
-
-using namespace paddle::tape;
-
-TEST(Tape, TestMLP) {
-  LOG(INFO) << "TestMLP";
-  Linear linear1(3, 3, "relu");
-  Linear linear2(3, 3, "relu");
-  Mean mean;
-
-  SGD sgd(0.001);
-
-  std::string initializer = "fill_constant";
-  paddle::framework::AttributeMap attrs;
-  attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-  attrs["shape"] = std::vector<int>{3, 3};
-  attrs["value"] = 1.0f;
-  Fill filler(initializer, attrs);
-
-  for (int i = 0; i < 2; ++i) {
-    reset_global_tape();
-
-    VariableHandle input(new Variable("input"));
-    filler(input);
-
-    auto loss = mean(linear2(linear1(input)));
-
-    get_global_tape().Backward(loss);
-
-    for (auto w : linear1.Params()) {
-      sgd(w);
-    }
-    for (auto w : linear2.Params()) {
-      sgd(w);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  std::vector<paddle::platform::Place> places;
-  places.emplace_back(paddle::platform::CPUPlace());
-  paddle::platform::DeviceContextPool::Init(places);
-
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/contrib/tape/variable.h b/paddle/contrib/tape/variable.h
deleted file mode 100644
index 35c328e69c9ebe25e907a59e4d67b999aff1d876..0000000000000000000000000000000000000000
--- a/paddle/contrib/tape/variable.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <memory>
-
-#include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace tape {
-
-class Variable;
-using VariableHandle = std::shared_ptr<Variable>;
-
-/*
- * Combination of
- *     framework::VarDesc desc_;
- *     framework::Variable var_;
- */
-class Variable {
- public:
-  Variable(const std::string pre_fix)
-      : desc_(pre_fix + std::to_string(count())) {}
-
-  Variable(const std::string pre_fix, bool is_grad)
-      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
-                                 : std::to_string(count()))) {}
-
-  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
-
-  // Instantiate LoDTensor/SelectedRow
-  void InitializeVariable();
-
-  VariableHandle Grad() {
-    if (grad_.expired()) {
-      VariableHandle new_grad(new Variable(desc_.Name(), true));
-      grad_ = new_grad;
-      return new_grad;
-    } else {
-      return VariableHandle(grad_);
-    }
-  }
-
-  // Stochastic Gradient Descent with Momentum
-  //  VariableHandle Momentum ();
-
-  //  void init(const std::string& initializer,
-  //            const framework::AttributeMap& attrs);
-
-  // void value() {};
-
-  const framework::VarDesc& Desc() const { return desc_; }
-  framework::VarDesc* MutableDesc() { return &desc_; }
-
-  // TODO(tonyyang-svail): No need to expose name
-  std::string Name() const { return desc_.Name(); }
-
-  framework::Variable* Var() { return &var_; }
-
- private:
-  int count() {
-    static int counter = 0;
-    return counter++;
-  }
-
-  framework::VarDesc desc_;
-  framework::Variable var_;
-
-  std::weak_ptr<Variable> grad_;
-};
-}
-}
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index e15232a77bb9c3e325b55737ea7abc55e3121708..b30a9806eb19ee12d2a70afe3ca806224b0f75d6 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id) {
-  auto* ctx = new ExecutorPrepareContext(program, block_id);
+  std::unique_ptr<ExecutorPrepareContext> ctx(
+      new ExecutorPrepareContext(program, block_id));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
-  return std::unique_ptr<ExecutorPrepareContext>(ctx);
+  return ctx;
 }
 
 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
@@ -406,6 +407,9 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
       }
     }
   }
+#else
+  LOG(WARNING)
+      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }
 
diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc
index 85beae775b96c3b7e08a2795bcd0ec79b24faeb4..a1094976f6c0965ac0a601d7e37575969146fdab 100644
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
@@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   }
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
+#ifndef PADDLE_WITH_MKLDNN
+  operators::math::SetNumThreads(1);
+#endif
 }
 
 void InitGLOG(const std::string &prog_name) {
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index a56674cbe216e312c4394ef537140122352dc785..e331c8128f2e8121dbbfe82b74ea35f2d0d399c0 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -410,5 +410,38 @@ void LoDTensor::MergeLoDTensor(
   }
 }
 
+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    if (offset_lod[lvl].size() > 0) {
+      level.reserve(offset_lod[lvl].size() - 1);
+    }
+    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD offset_lod;
+  offset_lod.reserve(length_lod.size());
+  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    level.reserve(length_lod[lvl].size() + 1);
+    size_t tmp = 0;
+    level.push_back(tmp);
+    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
+      tmp += length_lod[lvl][idx];
+      level.push_back(tmp);
+    }
+    offset_lod.push_back(level);
+  }
+  return offset_lod;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 1159fee39b0737402c60448dcbe69e7535c9d6e1..4a2729373b5c63176ed1e856f4acf29fd1e73254 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -226,5 +226,19 @@ extern void WriteToRecordIO(recordio::Writer* writer,
 extern std::vector<LoDTensor> ReadFromRecordIO(
     recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
 
+/*
+ * Convert between length-based LoD and offset-based LoD.
+ * The implementation of LoDTensor class use offset-based LoD.
+ * However, we want to expose the more user-friendly length-based
+ * LoD to the Python side instead.
+ *
+ * Example:
+ * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
+ * then length_lod = [[2, 1], [3, 2, 4]]
+ */
+LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
+
+LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 2ceffc93319359683e87e7fec2d18784c9bf02f3..6dfe7d2d8c1cce3360d99950240bc6de5a063dab 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -228,6 +228,38 @@ TEST(LoD, CheckAbsLoD) {
   ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }
 
+TEST(LoD, ConvertToLengthBasedLoD) {
+  LoD offset_lod;
+  offset_lod.push_back(std::vector<size_t>({0, 2}));
+  offset_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  offset_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  LoD length_lod = ConvertToLengthBasedLoD(offset_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({2}));
+  expected.push_back(std::vector<size_t>({1, 2}));
+  expected.push_back(std::vector<size_t>({2, 2, 1}));
+
+  EXPECT_EQ(length_lod, expected);
+}
+
+TEST(LoD, ConvertToOffsetBasedLoD) {
+  LoD length_lod;
+  length_lod.push_back(std::vector<size_t>({2}));
+  length_lod.push_back(std::vector<size_t>({1, 2}));
+  length_lod.push_back(std::vector<size_t>({2, 2, 1}));
+
+  LoD offset_lod = ConvertToOffsetBasedLoD(length_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 2}));
+  expected.push_back(std::vector<size_t>({0, 1, 3}));
+  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  EXPECT_EQ(offset_lod, expected);
+}
+
 template <typename T>
 static void TestRecordIO() {
   LoDTensor tensor;
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index bb2d866c824e0fec1b241caea407a38c88a3cb51..50f374e3703a97f6c1fdb4b14fdeb0b603f9ac86 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,48 +43,29 @@ Scope& Scope::NewScope() const {
 }
 
 Variable* Scope::Var(const std::string& name) {
-  // acquire the lock when new var under this scope
   std::unique_lock<std::mutex> lock(mutex_);
-  auto* v = FindVarLocally(name);
-  if (v != nullptr) return v;
-
-  v = new Variable();
-  vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
-  v->name_ = &(vars_.find(name)->first);
-  return v;
+  return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
-    *name = var_name;
+    *name = new_name;
   }
-  return Var(var_name);
+  return VarInternal(new_name);
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  // acquire the lock when find var
   std::unique_lock<std::mutex> lock(mutex_);
   return FindVarInternal(name);
 }
 
-Variable* Scope::FindVarInternal(const std::string& name) const {
-  auto var = FindVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
-}
-
 const Scope* Scope::FindScope(const Variable* var) const {
-  for (auto& kv : vars_) {
-    if (kv.second.get() == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindScopeInternal(var);
 }
+
 void Scope::DropKids() {
   std::unique_lock<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
@@ -92,6 +73,7 @@ void Scope::DropKids() {
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -127,6 +109,39 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  RenameInternal(origin_name, new_name);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
+  RenameInternal(origin_name, new_name);
+  return new_name;
+}
+
+Variable* Scope::VarInternal(const std::string& name) {
+  auto* v = FindVarLocally(name);
+  if (v != nullptr) return v;
+
+  v = new Variable();
+  vars_[name].reset(v);
+  VLOG(3) << "Create variable " << name;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+const Scope* Scope::FindScopeInternal(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second.get() == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+
+void Scope::RenameInternal(const std::string& origin_name,
+                           const std::string& new_name) const {
   auto origin_it = vars_.find(origin_name);
   PADDLE_ENFORCE(origin_it != vars_.end(),
                  "Cannot find original variable with name %s", origin_name);
@@ -137,10 +152,12 @@ void Scope::Rename(const std::string& origin_name,
   vars_.erase(origin_it);
 }
 
-std::string Scope::Rename(const std::string& origin_name) const {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-  Rename(origin_name, var_name);
-  return var_name;
+Variable* Scope::FindVarInternal(const std::string& name) const {
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }
 
 Variable* Scope::FindVarLocally(const std::string& name) const {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 95b4f7c5f66a4161058955c7666be34414f5074c..e246241c0abfbc7bdcaf38d073cc58fc36a4f737 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -88,12 +88,20 @@ class Scope {
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
+  // Called by Var.
+  Variable* VarInternal(const std::string& name);
+
+  // Called by FindScope.
+  const Scope* FindScopeInternal(const Variable* var) const;
+
+  // Called by Rename.
+  void RenameInternal(const std::string& origin_name,
+                      const std::string& new_name) const;
+
   // Called by FindVar recursively.
-  // Caller doesn't own the returned Variable.
   Variable* FindVarInternal(const std::string& name) const;
 
   // Called by FindVarInternal and Var.
-  // Caller doesn't own the returned Variable.
   Variable* FindVarLocally(const std::string& name) const;
 
   // Scope in `kids_` are owned by this class.
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 50835784440bfa177e38f9760bb4a47ad335a9e1..2bb2c8135d8c317388e1a0d711589a390c7e8924 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -1,23 +1,32 @@
 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
-cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc
-  DEPS paddle_fluid)
+cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+  fluid_to_data_flow_graph_pass.cc
+  data_flow_graph_to_fluid_pass.cc
+  tensorrt_subgraph_pass.cc
+  dfg_graphviz_draw_pass.cc
+  DEPS framework_proto)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
 
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
 
-cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid
-  ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec)
+function (inference_analysis_test TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-cc_test(test_subgraph_splitter
-        SRCS subgraph_splitter_tester.cc
-        DEPS analysis paddle_fluid tensor
-        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec)
+    cc_test(${TARGET}
+            SRCS "${analysis_test_SRCS}"
+            DEPS analysis
+            ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+    set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
+endfunction(inference_analysis_test)
 
-cc_test(test_dfg_graphviz_draw_pass
-        SRCS dfg_graphviz_draw_pass_tester.cc
-        DEPS analysis
-        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec)
+inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
+inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
+inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
+#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
+inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
diff --git a/paddle/contrib/tape/variable.cc b/paddle/fluid/inference/analysis/argument.cc
similarity index 50%
rename from paddle/contrib/tape/variable.cc
rename to paddle/fluid/inference/analysis/argument.cc
index 5ec1612909503f666bca0fce3246002879854156..cb0263d5d98e86b612696ebde66d17fb2543809b 100644
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/fluid/inference/analysis/argument.cc
@@ -12,22 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/contrib/tape/variable.h"
-
-namespace paddle {
-namespace tape {
-
-void Variable::InitializeVariable() {
-  LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
-  framework::proto::VarType::Type var_type = desc_.GetType();
-  if (var_type == framework::proto::VarType::LOD_TENSOR) {
-    var_.GetMutable<framework::LoDTensor>();
-  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
-    var_.GetMutable<framework::SelectedRows>();
-  } else {
-    PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
-                 var_type);
-  }
-}
-}
-}
+#include "paddle/fluid/inference/analysis/argument.h"
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7f4e03968a723df1718bd3752bdd1c3430d02be
--- /dev/null
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines the class Argument, which is the input and output of the
+ * analysis module. All the fields that needed either by Passes or PassManagers
+ * are contained in Argument.
+ *
+ * TODO(Superjomn) Find some way better to contain the fields when it grow too
+ * big.
+ */
+
+#pragma once
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * The argument definition of both Pass and PassManagers.
+ *
+ * All the fields should be registered here for clearness.
+ */
+struct Argument {
+  // The graph that process by the Passes or PassManagers.
+  std::unique_ptr<DataFlowGraph> main_dfg;
+
+  // The original program desc.
+  std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
+};
+
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__)               \
+  if (UNLIKELY(!(field__))) {                                \
+    LOG(ERROR) << "field " << #field__ << " should be set."; \
+    return false;                                            \
+  }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
index 4220451e3caee62caa51af5bc33d6dd3fd891018..c30a7c26cecbe67f0ca73223e06b2095584aca94 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/inference/analysis/node.h"
 
 namespace paddle {
 namespace inference {
@@ -57,19 +58,7 @@ std::string DataFlowGraph::DotString() const {
   // Add nodes
   for (size_t i = 0; i < nodes.size(); i++) {
     const Node &node = nodes.Get(i);
-    switch (node.type()) {
-      case Node::Type::kValue:
-        dot.AddNode(node.repr(), node.dot_attrs());
-        break;
-      case Node::Type::kFunction:
-        dot.AddNode(node.repr(), node.dot_attrs());
-        break;
-      case Node::Type::kFunctionBlock:
-        dot.AddNode(node.repr(), node.dot_attrs());
-        break;
-      default:
-        PADDLE_THROW("unsupported Node type %d", static_cast<int>(node.type()));
-    }
+    dot.AddNode(node.repr(), node.dot_attrs());
   }
 
   // Add edges
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f7d4cca2132d11eb89eee5a71ed0a3cc7381e1ff
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -0,0 +1,77 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/framework/proto_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
+  desc_ = argument->origin_program_desc.get();
+  // Here some logic from program_desc.cc and will not add new interfaces into
+  // framework::ProgramDesc class, use some UT to assure the correctness.
+  auto* block = desc_->mutable_blocks()->Add();
+  block->set_idx(framework::kRootBlockIndex);
+  block->set_parent_idx(framework::kNoneBlockIndex);
+  return true;
+}
+
+bool DataFlowGraphToFluidPass::Finalize() { return true; }
+
+void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
+  auto traits = GraphTraits<DataFlowGraph>(graph);
+  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
+    if (it->deleted()) continue;
+    switch (it->type()) {
+      case Node::Type::kFunction:
+        LOG(INFO) << "add function " << it->name();
+        AddFluidOp(&(*it));
+        break;
+      case Node::Type::kFunctionBlock:
+        AddEngineOp(&(*it));
+        break;
+      default:
+        continue;
+    }
+  }
+}
+
+void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
+  LOG(INFO) << "processing func " << node->name();
+  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+  // currently only the main block is analyzed.
+  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto* op = main_block->add_ops();
+  LOG(INFO) << "to copy the op";
+  *op = *ori_op;  // copy the attributes, by default, these will not be changed
+                  // by analysis phrase.
+  // The inputs and outputs of the existing ops are not changed by tensorrt
+  // subgraph pass.
+  // NOTE It might be changed by other passes in the long run.
+}
+
+void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+  // auto* ori_op = static_cast<framework::proto::OpDesc*>(node->extra_info());
+  // auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  // auto* op = main_block->add_ops();
+  // TODO(Superjomn) Here need to expose some arguments for default setting.
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbb05f622cc29c99c57e649b1c57cf3e54541191
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from fluid ProgramDesc to data flow
+ * graph.
+ */
+
+#pragma once
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
+ public:
+  DataFlowGraphToFluidPass() = default;
+
+  bool Initialize(Argument *argument) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  std::string repr() const override { return "DFG to fluid"; }
+  std::string description() const override {
+    return "Transform a DFG to a Fluid ProgramDesc";
+  }
+
+  Pass *CreatePrinterPass(std::ostream &os,
+                          const std::string &banner) const override {
+    return nullptr;
+  }
+
+ protected:
+  // Add a Fluid Op into the ProgramDesc.
+  void AddFluidOp(Node *node);
+  // Add a EngineOp into the ProgramDesc.
+  void AddEngineOp(Node *node);
+
+ private:
+  framework::proto::ProgramDesc *desc_;
+};
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
index dcee75cee50ede1d2b660e88e06544440bd5ef77..d8fc5e580a98f76233f01fdc4d7987311f78ee45 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -27,13 +27,12 @@ namespace inference {
 namespace analysis {
 
 TEST_F(DFG_Tester, Test) {
-  framework::proto::ProgramDesc new_desc;
   DataFlowGraph graph;
 
   FluidToDataFlowGraphPass pass0;
   DataFlowGraphToFluidPass pass1;
-  pass0.Initialize(desc);
-  pass1.Initialize(&new_desc);
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  ASSERT_TRUE(pass1.Initialize(&argument));
 
   pass0.Run(&graph);
   pass1.Run(&graph);
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afffb3feb0c515faa554d0d4919c442ca4515294
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
+  auto content = Draw(graph);
+  std::ofstream file(GenDotPath());
+  file.write(content.c_str(), content.size());
+  file.close();
+  LOG(INFO) << "draw dot to " << GenDotPath();
+}
+
+std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
+  Dot dot;
+  // Add nodes
+  for (size_t i = 0; i < graph->nodes.size(); i++) {
+    const Node &node = graph->nodes.Get(i);
+    if (config_.display_deleted_node || !node.deleted()) {
+      dot.AddNode(node.repr(), node.dot_attrs());
+    }
+  }
+  // Add edges
+  for (size_t i = 0; i < graph->nodes.size(); i++) {
+    const Node &node = graph->nodes.Get(i);
+    if (!config_.display_deleted_node && node.deleted()) continue;
+    for (auto &in : node.inlinks) {
+      if (!config_.display_deleted_node && in->deleted()) continue;
+      for (auto &in : node.inlinks) {
+        dot.AddEdge(in->repr(), node.repr(), {});
+      }
+    }
+  }
+  return dot.Build();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
index 41d4475382befa1bdaf7473520d64005a472a459..93ebff59ae9691394858f32c822a5e70f3345581 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #include <fstream>
 #include <string>
+#include "paddle/fluid/inference/analysis/dot.h"
 #include "paddle/fluid/inference/analysis/pass.h"
 
 namespace paddle {
@@ -32,35 +33,39 @@ namespace analysis {
  */
 class DFG_GraphvizDrawPass : public DataFlowGraphPass {
  public:
-  DFG_GraphvizDrawPass(const std::string& dir, const std::string& id)
-      : dir_(dir), id_(id) {}
-
-  bool Initialize() override { return Pass::Initialize(); }
-  void Run(DataFlowGraph* graph) override {
-    auto content = Draw(graph);
-    std::ofstream file(GenDotPath());
-    file.write(content.c_str(), content.size());
-    file.close();
-    LOG(INFO) << "draw dot to " << GenDotPath();
-  }
+  struct Config {
+    Config(const std::string &dir, const std::string &id,
+           bool display_deleted_node = false)
+        : dir(dir), id(id), display_deleted_node(display_deleted_node) {}
+
+    // The directory to store the .dot or .png files.
+    const std::string dir;
+    // The identifier for this dot file.
+    const std::string id;
+    // Whether to display deleted nodes, default false.
+    const bool display_deleted_node;
+  };
+
+  DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
 
+  bool Initialize(Argument *argument) override { return true; }
+  void Run(DataFlowGraph *graph) override;
   bool Finalize() override { return Pass::Finalize(); }
 
-  Pass* CreatePrinterPass(std::ostream& os,
-                          const std::string& banner) const override {
-    return nullptr;
+  std::string repr() const override { return "DFG graphviz drawer"; }
+  std::string description() const override {
+    return "Debug a DFG by draw with graphviz";
   }
 
  private:
   // Path of the dot file to output.
   std::string GenDotPath() const {
-    return dir_ + "/" + "graph_" + id_ + ".dot";
+    return config_.dir + "/" + "graph_" + config_.id + ".dot";
   }
 
-  std::string Draw(DataFlowGraph* graph) { return graph->DotString(); }
+  std::string Draw(DataFlowGraph *graph);
 
-  std::string dir_;
-  std::string id_;
+  Config config_;
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
index 3fc1cc18b855440c54c1ed6a9ab49a104c8c21f0..f4b5c5fd2201cc9ff56d7ee8d8921376c2c9c59e 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -24,9 +24,10 @@ namespace inference {
 namespace analysis {
 
 TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(desc);
-  DFG_GraphvizDrawPass pass("./", "test");
-  pass.Initialize();
+  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+  DFG_GraphvizDrawPass::Config config("./", "test");
+  DFG_GraphvizDrawPass pass(config);
+  pass.Initialize(&argument);
   pass.Run(&dfg);
 
   // test content
@@ -38,7 +39,8 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
   while (std::getline(file, line)) {
     no++;
   }
-  ASSERT_EQ(no, 82);
+  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
+  ASSERT_EQ(no, 112);
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 9f67c989cca4a936cd320b73efaae277263fb3e2..5f62eef52876ac68dfab00348f422a46de123cfe 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -21,19 +21,23 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
-FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {}
-
-bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); }
-
-bool FluidToDataFlowGraphPass::Initialize(
-    const framework::proto::ProgramDesc &desc) {
-  desc_ = &desc;
+bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
+  PADDLE_ENFORCE(argument);
+  if (!argument->main_dfg) {
+    LOG(INFO) << "Init DFG";
+    argument->main_dfg.reset(new DataFlowGraph);
+  }
+  desc_ = argument->origin_program_desc.get();
   return true;
 }
 
 bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }
 
 void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
+  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE(desc_);
   // insert vars
   std::unordered_map<std::string, size_t> var2id;
   auto &main_block = desc_->blocks(framework::kRootBlockIndex);
@@ -41,7 +45,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     const auto &var = main_block.vars(i);
     auto *v = graph->nodes.Create(Node::Type::kValue);
     v->SetName(var.name());
-    v->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&var)));
+    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
     var2id[var.name()] = v->id();
   }
   for (int i = 0; i < main_block.ops_size(); i++) {
@@ -51,7 +55,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     static_cast<Function *>(o)->SetFuncType(op.type());
     // Link to the original protobuf message's memory, make it easier to
     // generate from a data flow graph to fluid ProgramDesc.
-    o->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&op)));
+    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
     // set inputs and outputs
     // TODO(Superjomn) make sure the InputNames is the real variable name.
     for (int j = 0; j < op.inputs_size(); j++) {
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
index 33517e57becdffc0416f204247eac5feadb7ed82..176faf0220cc98bf2c0384af75125d4bc493e753 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -34,13 +34,18 @@ namespace analysis {
  */
 class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
  public:
-  FluidToDataFlowGraphPass();
-  bool Initialize() override;
-  bool Initialize(const framework::proto::ProgramDesc &desc) override;
+  FluidToDataFlowGraphPass() = default;
+
+  bool Initialize(Argument *argument) override;
   bool Finalize() override;
 
   void Run(DataFlowGraph *graph) override;
 
+  std::string repr() const override { return "fluid-to-data-flow-graph"; }
+  std::string description() const override {
+    return "transform a fluid ProgramDesc to a data flow graph.";
+  }
+
   Pass *CreatePrinterPass(std::ostream &os,
                           const std::string &banner) const override;
 
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
index 817d32c92cdbdc234eef9ed5156891c2b11ced4c..cfbbc284e491bd62a6108d6d14e7896a57d1b63e 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -23,11 +23,11 @@ namespace analysis {
 
 TEST_F(DFG_Tester, Init) {
   FluidToDataFlowGraphPass pass;
-  pass.Initialize();
-  pass.Initialize(desc);
+  pass.Initialize(&argument);
   DataFlowGraph graph;
   pass.Run(&graph);
-  ASSERT_GT(graph.nodes.size(), 0);
+  // Analysis is sensitive to ProgramDesc, careful to change the original model.
+  ASSERT_EQ(graph.nodes.size(), 37);
   pass.Finalize();
   LOG(INFO) << '\n' << graph.DotString();
 }
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 58eb0e715cb71d87179f3240de55021603cd7423..f0039e113159fdcc0cc1c209a8bc899bc82984c1 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -62,6 +62,7 @@ struct DataTypeNamer {
     SET_TYPE(int);
     SET_TYPE(bool);
     SET_TYPE(float);
+    SET_TYPE(void *);
   }
 
   std::unordered_map<decltype(typeid(int).hash_code()),  // NOLINT
diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc
index fe060526080b1ee01aa98f2ff06fb2191eddf9da..3339b5044df0cf91d00aa9ddad310d4bf263bc3c 100644
--- a/paddle/fluid/inference/analysis/node.cc
+++ b/paddle/fluid/inference/analysis/node.cc
@@ -40,6 +40,9 @@ Node *NodeMap::Create(Node::Type type) {
     case Node::Type::kValue:
       nodes_.emplace_back(new Value);
       break;
+    case Node::Type::kFunctionBlock:
+      nodes_.emplace_back(new FunctionBlock);
+      break;
     default:
       PADDLE_THROW("Not supported node type.");
   }
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
index 7972ca25c92186a8c55a76de645f4fdbb089e8d3..8c2e6d88b9605d9923d002f73b60cd92b5e551b7 100644
--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -71,12 +71,17 @@ class Node {
 
   // Get an additional attribute and convert it to T data type. NOTE this will
   // silently create a new attribute if not exists.
-  Attr &attr(const std::string &name) { return attrs_[name]; }
+  Attr &attr(const std::string &name) const { return attrs_[name]; }
 
   int id() const { return id_; }
 
-  bool deleted() const { return deleted_; }
+  // The Protobuf description is set/get with a void* to decouple Node interface
+  // from a specific kind of Protobuf message.
+  void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
+  void *pb_desc() const { return attr("pb_desc").Pointer(); }
+
   void SetDeleted() { deleted_ = true; }
+  bool deleted() const { return deleted_; }
 
   void SetName(const std::string &name) { name_ = name; }
   const std::string &name() const { return name_; }
@@ -84,29 +89,25 @@ class Node {
   void SetType(Type type) { type_ = type; }
   Type type() const { return type_; }
 
-  void *extra_info() const { return extra_info_; }
-  void SetExtraInfo(void *extra_info) { extra_info_ = extra_info; }
-
   // Input links.
   std::vector<Node *> inlinks;
   // Output links.
   std::vector<Node *> outlinks;
 
   // A helper class to maintain the status from Pass.
-  // TODO(superjomn) add a checker here to ensure the T is primary.
   struct Attr {
     // NOTE T should be a primary type or a struct combined by several primary
     // types.
     // NOTE the STL containers should not use here.
     // Some usages
-    // Attr attr;
-    // T data;
-    // attr.data.assign((char*)data, sizeof(data));
+    //   Attr attr;
+    //   attr.Bool() = true;
 
     bool &Bool() { return As<bool>(); }
     float &Float() { return As<float>(); }
     int32_t &Int32() { return As<int32_t>(); }
     int64_t &Int64() { return As<int64_t>(); }
+    void *&Pointer() { return As<void *>(); }
 
    private:
     template <typename T>
@@ -130,6 +131,7 @@ class Node {
     size_t type_hash_{std::numeric_limits<size_t>::max()};
   };
 
+  // Type checks.
   bool IsFunction() const { return type_ == Node::Type::kFunction; }
   bool IsValue() const { return type_ == Node::Type::kValue; }
   bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
@@ -148,9 +150,6 @@ class Node {
   Type type_{Type::kNone};
   // Mark this node is deleted by some pass.
   bool deleted_{false};
-
-  void *extra_info_;
-
   mutable std::unordered_map<std::string, Attr> attrs_;
 };
 
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
index aa0e8667b5e4a9e6156c25fcad03bb8eee3287f6..65632b749177add9dcb297bffad1e85f68a80b02 100644
--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/node.h"
@@ -30,19 +31,24 @@ namespace analysis {
 class Pass {
  public:
   Pass() = default;
-  virtual ~Pass() {}
+  virtual ~Pass() = default;
   // Virtual method overridden by subclasses to do only necessary initialization
   // before any pass is run.
-  virtual bool Initialize() { return false; }
+  // virtual bool Initialize() { return false; }
   // There is some passes such as FlowToDataFlowGraphPass that needs a
   // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
   // only couple with the proto file.
-  virtual bool Initialize(const framework::proto::ProgramDesc &desc) {
-    return false;
-  }
+  // virtual bool Initialize(const framework::proto::ProgramDesc &desc) { return
+  // false; }
   // There are some Passes such as DataFlowGraphToFluidPass that will output a
   // ProgramDesc.
-  virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; }
+  // virtual bool Initialize(framework::proto::ProgramDesc *desc) { return
+  // false; }
+
+  // Mutable Pass.
+  virtual bool Initialize(Argument *argument) { return false; }
+  // Readonly Pass.
+  virtual bool Initialize(const Argument &argument) { return false; }
 
   // Virtual method overriden by subclasses to do any necessary clean up after
   // all passes have run.
@@ -50,7 +56,9 @@ class Pass {
 
   // Get a Pass appropriate to print the Node this pass operates on.
   virtual Pass *CreatePrinterPass(std::ostream &os,
-                                  const std::string &banner) const = 0;
+                                  const std::string &banner) const {
+    return nullptr;
+  }
 
   // Run on a single Node.
   virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
@@ -60,6 +68,11 @@ class Pass {
   virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
   // Run on a single DataFlowGraph.
   virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+
+  // Human-readable short representation.
+  virtual std::string repr() const = 0;
+  // Human-readable long description.
+  virtual std::string description() const = 0;
 };
 
 // NodePass process on any Node types.
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b17c0e0d724ebeea7b84bf63024cd141891a78b4
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void DfgPassManager::RunAll() {
+  PADDLE_ENFORCE(argument_);
+  for (auto& pass : data_) {
+    VLOG(4) << "Running pass [" << pass->repr() << "]";
+    pass->Run(argument_->main_dfg.get());
+  }
+}
+
+void NodePassManager::RunAll() {
+  PADDLE_ENFORCE(argument_);
+  PADDLE_ENFORCE(argument_->main_dfg.get());
+  auto trait =
+      GraphTraits<DataFlowGraph>(argument_->main_dfg.get()).nodes_in_DFS();
+  for (auto& node : trait) {
+    for (auto& pass : data_) {
+      pass->Run(&node);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..7841c4b9d08001264af9f3a248a96814d1c273c4
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the logic of pass management. The analysis for inference is
+ * a pipeline of Passes, a PassManager is a agency that helps to manage the
+ * executation of the Passes.
+ *
+ * There are two modes of Passes, the first one is called NodePass and takes
+ * an Node as input and output; the second one is called DFGPass and takes a
+ * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in
+ * the same pipeline, there are two kinds of PassManagers, both takes a DFG as
+ * input and output a DFG, but the Passes inside are different:
+ *
+ *   1. NodePassManager: the passes inside are all NodePasses, it can have
+ *      different graph trivial algorithm, for example, DFS_NodePassManager will
+ *      trigger the passes in depth first order;
+ *   2. DfgPassManager: the passes inside are all DfgPasses.
+ */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * PassManager is the base class for all pass managers, a pass manager has
+ * several Pass-es registered, and execute them in the linear order.
+ */
+class PassManager : public OrderedRegistry<Pass> {
+ public:
+  PassManager() = default;
+  // Call all the passes' Initialize methods. The desc and data_flow_graph are
+  // globally shared, so pass them as the arguemnts for all the pass managers.
+  virtual bool Initialize(const Argument& argument) { return false; }
+
+  virtual bool Initialize(Argument* argument) {
+    argument_ = argument;
+    for (auto& pass : data_) {
+      LOG(INFO) << "Initializing pass " << pass->repr();
+      if (!pass->Initialize(argument)) {
+        LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Call all the passes' Finalize methods.
+  virtual bool Finalize() {
+    for (auto& pass : data_) {
+      if (!pass->Finalize()) {
+        LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Run all the passes.
+  virtual void RunAll() = 0;
+
+  // Short identifier.
+  virtual std::string repr() const = 0;
+  // Long description.
+  virtual std::string description() const = 0;
+
+  virtual ~PassManager() = default;
+
+ protected:
+  Argument* argument_{nullptr};
+};
+
+/*
+ * A pass manager that process a DFG.
+ */
+class DfgPassManager : public PassManager {
+ public:
+  DfgPassManager() = default;
+
+  void RunAll() override;
+
+  virtual ~DfgPassManager() = default;
+};
+
+/*
+ * A pass manager that process a Node each time.
+ */
+class NodePassManager : public PassManager {
+ public:
+  NodePassManager() = default;
+
+  void RunAll() override;
+
+  virtual ~NodePassManager() = default;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7af6a199514636224f0b8303abea7d398400d278
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class TestDfgPassManager final : public DfgPassManager {
+ public:
+  TestDfgPassManager() = default;
+  virtual ~TestDfgPassManager() = default;
+  // Short identifier.
+  std::string repr() const override { return "test-pass-manager"; }
+  // Long description.
+  std::string description() const override { return "test doc"; }
+};
+
+class TestNodePassManager final : public NodePassManager {
+ public:
+  virtual ~TestNodePassManager() = default;
+
+  std::string repr() const override { return "test-node-pass-manager"; }
+  std::string description() const override { return "test doc"; }
+};
+
+class TestNodePass final : public NodePass {
+ public:
+  virtual ~TestNodePass() = default;
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  void Run(Node* node) override {
+    LOG(INFO) << "- Processing node " << node->repr();
+  }
+
+  std::string repr() const override { return "test-node"; }
+  std::string description() const override { return "some doc"; }
+};
+
+TEST_F(DFG_Tester, DFG_pass_manager) {
+  TestDfgPassManager manager;
+  DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
+
+  manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass);
+  manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
+  manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
+
+  ASSERT_TRUE(manager.Initialize(&argument));
+  manager.RunAll();
+}
+
+TEST_F(DFG_Tester, Node_pass_manager) {
+  // Pre-process: initialize the DFG with the ProgramDesc first.
+  FluidToDataFlowGraphPass pass0;
+  pass0.Initialize(&argument);
+  pass0.Run(argument.main_dfg.get());
+
+  TestNodePassManager manager;
+  manager.Register("test-node-pass", new TestNodePass);
+  ASSERT_TRUE(manager.Initialize(&argument));
+  manager.RunAll();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
index 0644c0db12e3daabba76dbaad33847f5624b157a..8134494f8bccb132f2ed7d1ba1fb615a298596ed 100644
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -19,22 +19,23 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+  if (node->type() != Node::Type::kFunction) return false;
+  const auto* func = static_cast<const Function*>(node);
+  if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+      func->func_type() == "conv2d" || func->func_type() == "mul" ||
+      func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+    LOG(INFO) << "sub-graph marked " << node->repr();
+    return true;
+  }
+  return false;
+};
+
 TEST_F(DFG_Tester, Split) {
   auto desc = LoadProgramDesc();
   auto dfg = ProgramDescToDFG(desc);
   LOG(INFO) << "spliter\n" << dfg.DotString();
 
-  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
-    if (node->type() != Node::Type::kFunction) return false;
-    const auto* func = static_cast<const Function*>(node);
-    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
-        func->func_type() == "conv2d" || func->func_type() == "mul" ||
-        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
-      LOG(INFO) << "sub-graph marked " << node->repr();
-      return true;
-    }
-    return false;
-  };
   ASSERT_GT(dfg.nodes.size(), 5UL);
 
   auto subgraphs = SubGraphSplitter(&dfg, teller)();
@@ -62,6 +63,28 @@ TEST_F(DFG_Tester, Split) {
   ASSERT_EQ(subgraphs.back().size(), 6UL);
 }
 
+TEST_F(DFG_Tester, Fuse) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+
+  size_t count0 = dfg.nodes.size();
+
+  SubGraphFuse fuse(&dfg, teller);
+  fuse();
+
+  int count1 = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->deleted()) {
+      LOG(INFO) << "deleted " << node->repr();
+    }
+    count1 += node->deleted();
+  }
+
+  // At least one nodes should be deleted.
+  ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
+  ASSERT_EQ(6UL, count1);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b75df33b71311acd0e626e5a13c18469b19ef136
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -0,0 +1,33 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TensorRTSubGraphPass::TensorRTSubGraphPass(
+    const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller)
+    : node_inside_subgraph_teller_(teller) {}
+
+void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
+  SubGraphFuse(graph, node_inside_subgraph_teller_);
+}
+
+}  // analysis
+}  // inference
+
+}  // paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..79e9e2bcc9e626a102dfdab6f1f50c8d58f9bbdd
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Parse the graph and replace TensorRT supported nodes with SubGraphNode
+ */
+class TensorRTSubGraphPass : public DataFlowGraphPass {
+ public:
+  // Tell whether to transform a sub-graph into TensorRT.
+  using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
+
+  TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  // This class get a sub-graph as input and determine whether to transform this
+  // sub-graph into TensorRT.
+  void Run(DataFlowGraph* graph) override;
+
+ private:
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d12dcf0d0fe7f9354f7ed1aac924aeab3403e9b8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(model_dir, "", "inference test model dir");
+
+TEST(TensorRTSubGraph, single_pass) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+    if (node->type() != Node::Type::kFunction) return false;
+    const auto* func = static_cast<const Function*>(node);
+    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+        func->func_type() == "conv2d" || func->func_type() == "mul" ||
+        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+      LOG(INFO) << "sub-graph marked " << node->repr();
+      return true;
+    }
+    return false;
+  };
+
+  DFG_GraphvizDrawPass::Config config{"./", "test"};
+  DFG_GraphvizDrawPass dfg_pass(config);
+  dfg_pass.Initialize();
+
+  DFG_GraphvizDrawPass dfg_pass1(config);
+  dfg_pass1.Initialize();
+
+  dfg_pass.Run(&dfg);
+
+  TensorRTSubGraphPass trt_pass(std::move(teller));
+  trt_pass.Initialize();
+
+  trt_pass.Run(&dfg);
+
+  dfg_pass1.Run(&dfg);
+
+  // Check the TRT op's block desc
+  for (auto node : dfg.nodes.nodes()) {
+    if (node->IsFunctionBlock()) {
+    }
+  }
+}
+
+TEST(TensorRTSubGraph, pass_manager) {}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
index 722fa99a48a5f2b0e778904de0c35977d0ee3cc0..ce1191a567a4198f003520c40bf02487c48c56eb 100644
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -15,33 +15,46 @@ limitations under the License. */
 #pragma once
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
+#include <fstream>
 #include <string>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/io.h"
 
 namespace paddle {
 namespace inference {
+
+// Read ProgramDesc from a __model__ file, defined in io.cc
+extern void ReadBinaryFile(const std::string& filename, std::string* contents);
+
 namespace analysis {
 
 DEFINE_string(inference_model_dir, "", "inference test model dir");
 
 static framework::proto::ProgramDesc LoadProgramDesc(
     const std::string& model_dir = FLAGS_inference_model_dir) {
-  paddle::platform::CPUPlace place;
-  paddle::framework::Executor executor(place);
-  paddle::framework::Scope scope;
-  auto program = Load(&executor, &scope, model_dir);
-  return *program->Proto();
+  std::string msg;
+  std::string net_file = FLAGS_inference_model_dir + "/__model__";
+  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
+  fin.seekg(0, std::ios::end);
+  msg.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(msg.at(0)), msg.size());
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  program_desc.ParseFromString(msg);
+  return program_desc;
 }
 
 static DataFlowGraph ProgramDescToDFG(
     const framework::proto::ProgramDesc& desc) {
   DataFlowGraph graph;
   FluidToDataFlowGraphPass pass;
-  pass.Initialize(desc);
+  Argument argument;
+  argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
+  pass.Initialize(&argument);
   pass.Run(&graph);
   pass.Finalize();
   return graph;
@@ -49,9 +62,12 @@ static DataFlowGraph ProgramDescToDFG(
 
 class DFG_Tester : public ::testing::Test {
  protected:
-  void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); }
+  void SetUp() override {
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
+  }
 
-  framework::proto::ProgramDesc desc;
+  Argument argument;
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index c7a5a49dd02d0db022fabff5c3ae1c7800bac25c..6697952051c4b1997ca6b550da17a52e64cb3454 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -64,7 +64,8 @@ class OpConverter {
     (*it)(op, scope, test_mode);
   }
 
-  // convert fluid block to tensorrt network
+  // Convert a fluid block to tensorrt network, NOTE it just convert operators,
+  // the INetwork's inputs and outputs should specified in some other modules.
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index b60f00de9fa5fc8f8f4537379bf9ee9c8bb6f31c..b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
+  TensorRTEngine(int max_batch, int max_workspace,
+                 cudaStream_t* stream = nullptr,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream),
+        stream_(stream ? stream : &default_stream_),
         logger_(logger) {}
 
   virtual ~TensorRTEngine();
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
   // the max memory size the engine uses
   int max_workspace_;
   cudaStream_t* stream_;
+  // If stream_ is not set from outside, hold its own stream.
+  cudaStream_t default_stream_;
   nvinfer1::ILogger& logger_;
 
   std::vector<Buffer> buffers_;
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
  */
 class TRT_EngineManager {
  public:
-  TensorRTEngine* Create(int max_batch, int max_workspace,
-                         cudaStream_t* stream) {
-    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
-    return engines_.back().get();
+  bool HasEngine(const std::string& name) const {
+    return engines_.count(name) != 0;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create or get an engine called `name`
+  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
+                         const std::string& name) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+    engines_[name].reset(p);
+    return p;
   }
 
   void DeleteALl() {
-    for (auto& ptr : engines_) {
-      ptr.reset(nullptr);
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
     }
   }
 
  private:
-  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
index 9dcd79c3bb9ed713ff0f12024969cc5798750988..cbba8b9d559e024fc1e955489bb8d37c77097d25 100644
--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -29,6 +29,7 @@ DEFINE_string(data_file, "", "File of input index data.");
 DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
+DECLARE_bool(use_mkldnn);
 
 inline double GetCurrentMs() {
   struct timeval time;
@@ -103,9 +104,9 @@ void ThreadRunInfer(
     const int tid, paddle::framework::Scope* scope,
     const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
   // maybe framework:ProgramDesc is not thread-safe
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
   auto& sub_scope = scope->NewScope();
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
   auto inference_program =
       paddle::inference::Load(&executor, scope, FLAGS_model_path);
 
@@ -182,8 +183,8 @@ TEST(inference, nlp) {
     stop_ms = GetCurrentMs();
   } else {
     // 1. Define place, executor, scope
-    auto place = paddle::platform::CPUPlace();
-    auto executor = paddle::framework::Executor(place);
+    paddle::platform::CPUPlace place;
+    paddle::framework::Executor executor(place);
 
     // 2. Initialize the inference_program and load parameters
     std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index d5390529163491c2711e50ffad236534e88b73ee..9b1ab1e228dd758b52975abc4c4aa0bdeadbe2de 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -43,14 +43,16 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
 
   *index = 0;  // unlock memory
 
-  void* p;
+  void* p = nullptr;
 
 #ifdef PADDLE_WITH_MKLDNN
   // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
   // memory alignment
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
+                    size);
 #else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
+                    size);
 #endif
   PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
 
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index 46ed99bcf2234f7621d9f00eb48c846d8a355795..137bca5e2b8e2754aed274970e08b03ee816a7f2 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -12,16 +12,20 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
 namespace paddle {
 namespace operators {
 
-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::to_void_cast;
 
 namespace {
 std::string gethash(const mkldnn::memory::dims &operand_dims,
@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims,
   };
   return dim2str(operand_dims) + std::to_string(algorithm);
 }
+}  // namespace
+
+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
 
-template <typename T, typename ExecContext>
-void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                     const T alpha = 0, const T beta = 0) {
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input OutGrad tensor");
+
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
+
+template <typename T>
+void eltwise_forward(const framework::ExecutionContext &ctx,
+                     mkldnn::algorithm algorithm, const T alpha = 0,
+                     const T beta = 0) {
   PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                  "It must use CPUPlace.");
-
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
   const auto &mkldnn_engine = dev_ctx.GetEngine();
 
-  // get buffers
-  const auto *src = ctx.template Input<Tensor>("X");
-  const auto *src_data = src->template data<T>();
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Output<Tensor>("Out");
 
-  auto *dst = ctx.template Output<Tensor>("Out");
-  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());
 
-  // get memory dim
-  PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
+  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
                  "Input dim must be with 2 or 4");
-  std::vector<int> src_tz = framework::vectorize2int(src->dims());
+
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());
+
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
 
   const std::string key = gethash(src_tz, algorithm);
   const std::string key_src_data =
       key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
-  const std::string key_fwd = key + "@eltwise_fwd";
+  const std::string key_src_layout =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
+  const std::string key_with_layout = key + std::to_string(src_format);
+  const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key_with_layout + "@eltwise_fwd";
+  const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
+
+  // save input data and layout to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(x_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+  auto p_src_layout = std::make_shared<memory::format>(src_format);
+  dev_ctx.SetBlob(key_src_layout, p_src_layout);
 
   auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
       dev_ctx.GetBlob(key_fwd));
 
-  // save input data to be referred in backward path
-  auto p_src_data = std::make_shared<const T *>(src_data);
-  dev_ctx.SetBlob(key_src_data, p_src_data);
+  std::shared_ptr<memory> dst_memory;
 
   if (p_fwd == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-
-    // create memory primitives
-    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
-    dev_ctx.SetBlob(key_src_mem, p_src_mem);
-
-    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
-    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
-
-    auto fwd_desc = mkldnn::eltwise_forward::desc(
-        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-        fwd_desc, mkldnn_engine);
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
-    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
-        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    // create mkldnn memory for input X
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
+    auto src_memory = std::shared_ptr<memory>(
+        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
+    // save src_memory to be referred in backward path
+    dev_ctx.SetBlob(key_src_mem, src_memory);
+
+    // create primitive descriptor for activation forward and save it
+    auto forward_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm,
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+        forward_desc, mkldnn_engine);
+
+    // save prim desc into global device context to be referred in backward path
+    dev_ctx.SetBlob(key_fwd_pd, forward_pd);
+
+    // create mkldnn memory for output y
+    dst_memory =
+        std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
+
+    dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    // create activation primitive
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
+                                                      *dst_memory);
     dev_ctx.SetBlob(key_fwd, p_fwd);
   } else {
     // primitives already exist
-    auto p_src_mem =
+    auto src_memory =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-    PADDLE_ENFORCE(p_src_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
-    auto p_dst_mem =
+    PADDLE_ENFORCE(src_memory != nullptr,
+                   "Fail to find eltwise src_memory in device context.");
+    dst_memory =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
-    PADDLE_ENFORCE(p_dst_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find eltwise dst_memory in device context.");
 
-    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
-    p_dst_mem->set_data_handle(dst_data);
+    src_memory->set_data_handle(platform::to_void_cast(x_data));
+    dst_memory->set_data_handle(y_data);
   }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_fwd);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  y->set_layout(DataLayout::kMKLDNN);
+  y->set_format(GetMKLDNNFormat(*dst_memory));
 }
 
-template <typename T, typename ExecContext>
-void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                  const T alpha = 0, const T beta = 0) {
+template <typename T>
+void eltwise_grad(const framework::ExecutionContext &ctx,
+                  mkldnn::algorithm algorithm, const T alpha = 0,
+                  const T beta = 0) {
   auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
   const auto &mkldnn_engine = dev_ctx.GetEngine();
 
-  // get buffers
-  const auto *out = ctx.template Input<Tensor>("Out");
-
-  auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-  const auto *diff_dst = dout->template data<T>();
+  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-  auto *dx =
-      ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-  const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
+  const T *diff_y_data = diff_y->data<T>();
+  T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
 
-  // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+  std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
 
-  const std::string key = gethash(src_tz, algorithm);
-  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
-  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
-  const std::string key_grad = key + "@eltwise_grad";
+  auto diff_y_format =
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
 
+  const std::string key = gethash(diff_dst_tz, algorithm);
   const std::string key_src_data =
       key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_layout =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
+  const auto p_src_layout =
+      std::static_pointer_cast<memory::format>(dev_ctx.GetBlob(key_src_layout));
+  const std::string key_src_mem =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
+  const std::string key_fwd_pd =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
+  const std::string key_with_layouts =
+      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
+  const std::string key_diff_src_mem =
+      key_with_layouts + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem =
+      key_with_layouts + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key_with_layouts + "@eltwise_grad";
+
   const auto p_src_data =
       std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));
 
-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  auto p_src_mem =
+  auto src_memory =
       std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-  p_src_mem->set_data_handle(*p_src_data.get());
+  PADDLE_ENFORCE(src_memory != nullptr,
+                 "Fail to find src_memory in device context");
+  src_memory->set_data_handle(*p_src_data.get());
+
+  std::shared_ptr<memory> diff_src_memory;
 
-  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>(
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
       dev_ctx.GetBlob(key_grad));
 
   if (p_grad == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-
-    // create memory primitives
-    std::shared_ptr<void> p_diff_src_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
-    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
-    std::shared_ptr<void> p_diff_dst_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
-    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
-
-    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
-                                                   alpha, beta);
-
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
-        dev_ctx.GetBlob(key_fwd_pd).get());
-
-    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-        bwd_desc, mkldnn_engine, *p_fwd_pd);
-
+    // create mkldnn memory for input diff_y
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
+    auto diff_dst_memory = std::shared_ptr<memory>(
+        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
+    dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
+
+    // retrieve eltwise primitive desc from device context
+    auto forward_pd =
+        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_fwd_pd));
+    PADDLE_ENFORCE(forward_pd != nullptr,
+                   "Fail to find eltwise_fwd_pd in device context");
+
+    // ceate primitive descriptor for activation backward
+    auto backward_desc = mkldnn::eltwise_backward::desc(
+        algorithm, diff_dst_memory->get_primitive_desc().desc(),
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
+        backward_desc, mkldnn_engine, *forward_pd);
+
+    // create mkldnn memory for output diff_src
+    diff_src_memory = std::make_shared<memory>(
+        backward_pd.diff_src_primitive_desc(), diff_x_data);
+    dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);
+
+    // create activation backward primitive
     p_grad = std::make_shared<mkldnn::eltwise_backward>(
-        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
-        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
-        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+        backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
+    dev_ctx.SetBlob(key_grad, p_grad);
   } else {
     // primitives already exist
-    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+    diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
         dev_ctx.GetBlob(key_diff_src_mem));
-    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+    auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
         dev_ctx.GetBlob(key_diff_dst_mem));
 
-    p_diff_src_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_src));
-    p_diff_dst_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_dst));
+    diff_src_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_x_data));
+    diff_dst_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_y_data));
   }
 
   // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_grad);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  diff_x->set_layout(DataLayout::kMKLDNN);
+  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
 }
-}  // anonymous namespace
 
 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
     eltwise_forward<T>(ctx, algorithm);
   }
 };
 
 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
     eltwise_grad<T>(ctx, algorithm);
   }
 };
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index af1d85047e519df6766b2139a0445ae9dc5945e2..b6b498a616c22898bcf99ada6f150a4b5a9bd54c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using paddle::framework::Tensor;
+
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
   class OP_NAME##OpMaker                                                \
       : public ::paddle::framework::OpProtoAndCheckerMaker {            \
@@ -29,7 +31,7 @@ namespace operators {
       AddAttr<bool>("use_mkldnn",                                       \
                     "(bool, default false) Only used in mkldnn kernel") \
           .SetDefault(false);                                           \
-      AddComment(OP_COMMENT);                                           \
+      AddComment(#OP_COMMENT);                                          \
     }                                                                   \
   }
 
@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const framework::OperatorWithKernel& oper,
                                       const std::string& name) {
   framework::LibraryType library{framework::LibraryType::kPlain};
-
   framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
   auto it = oper.Attrs().find("use_mkldnn");
@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel {
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return GetKernelType(ctx, *this, "X");
@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }
 
+ protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return GetKernelType(ctx, *this, "Out");
@@ -112,7 +115,7 @@ $$out = \frac{1}{1 + e^{-x}}$$
 __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator
 
-$$out = \log \frac{1}{1 + e^{-x}}$$
+$$out = \\log \\frac{1}{1 + e^{-x}}$$
 
 )DOC";
 
@@ -133,7 +136,7 @@ $out = \max(x, 0)$
 __attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.
 
-$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
 )DOC";
 
@@ -196,7 +199,7 @@ $out = [x]$
 __attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.
 
-$$out = \frac{1}{x}$$
+$$out = \\frac{1}{x}$$
 
 )DOC";
 
@@ -252,15 +255,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "Output of Softshrink operator");
     AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
-Softshrink Activation Operator.
+:strong:`Softshrink Activation Operator`
 
-$$
-out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases} 
+         x - \lambda, \text{if } x > \lambda \\
+         x + \lambda, \text{if } x < -\lambda \\
+         0,  \text{otherwise}
+         \end{cases}
 
 )DOC");
   }
@@ -271,18 +273,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "Input of HardShrink operator");
     AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+    AddAttr<float>("threshold",
+                   "The value of threshold for HardShrink. [default: 0.5]")
         .SetDefault(0.5f);
     AddComment(R"DOC(
-HardShrink Activation Operator.
+:strong:`HardShrink activation operator`
 
-$$
-out = \begin{cases} 
-    x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases}
+            x, \text{if } x > \lambda \\
+            x, \text{if } x < -\lambda \\
+            0,  \text{otherwise}
+          \end{cases}
 
 )DOC");
   }
@@ -394,18 +396,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "Input of ThresholdedRelu operator");
     AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold", "The threshold location of activation")
+    AddAttr<float>("threshold",
+                   "The threshold location of activation. [default 1.0].")
         .SetDefault(1.0f);
     AddComment(R"DOC(
-ThresholdedRelu Activation Operator.
+:strong:`ThresholdedRelu activation operator`
 
-$$
-out = \begin{cases} 
-    x, \text{if } x > threshold \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
 
+    out = \begin{cases}
+             x,  \text{if } x > threshold \\
+             0,  \text{otherwise}
+          \end{cases}
 )DOC");
   }
 };
@@ -444,7 +446,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Swish Activation Operator.
 
-$$out = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \\frac{x}{1 + e^{- \beta x}}$$
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
index 62636bb2f9078768180ab1e0016e3565617d24d2..dc43c69be0bcea2b82e1d61a9a5b2e03129d4f8e 100644
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -91,32 +91,31 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
         "(int64_t). The number of chunks both in Inference and Label on the "
         "given mini-batch.");
     AddAttr<int>("num_chunk_types",
-                 "(int). The number of chunk type. See below for details.");
-    AddAttr<std::string>(
-        "chunk_scheme",
-        "(string, default IOB). The labeling scheme indicating "
-        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
-        "for details.")
+                 "The number of chunk type. See the description for details.");
+    AddAttr<std::string>("chunk_scheme",
+                         "The labeling scheme indicating "
+                         "how to encode the chunks. Must be IOB, IOE, IOBES or "
+                         "plain. See the description"
+                         "for details.")
         .SetDefault("IOB");
     AddAttr<std::vector<int>>("excluded_chunk_types",
-                              "(list<int>) A list including chunk type ids "
+                              "A list including chunk type ids "
                               "indicating chunk types that are not counted. "
-                              "See below for details.")
+                              "See the description for details.")
         .SetDefault(std::vector<int>{});
     AddComment(R"DOC(
 For some basics of chunking, please refer to
-‘Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>’.
+'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
 
-
-CheckEvalOp computes the precision, recall, and F1-score of chunk detection,
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
 and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
 Here is a NER example of labeling for these tagging schemes:
-
- 	     Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
-  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
-  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
-  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
-  IOBES: B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+   
+          Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+   IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+   IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+   IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+   IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
 
 There are three chunk types(named entity types) including PER(person), ORG(organization)
 and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
@@ -124,31 +123,31 @@ and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chun
 Since the calculations actually use label ids rather than labels, extra attention
 should be paid when mapping labels to ids to make CheckEvalOp work. The key point
 is that the listed equations are satisfied by ids.
-
-    tag_type = label % num_tag_type
-    chunk_type = label / num_tag_type
+   
+   tag_type = label % num_tag_type
+   chunk_type = label / num_tag_type
 
 where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
 is the num of chunk types, and `tag_type` get its value from the following table.
-
-    Scheme Begin Inside End   Single
-     plain   0     -      -     -
-     IOB     0     1      -     -
-     IOE     -     0      1     -
-     IOBES   0     1      2     3
+   
+   Scheme Begin Inside End   Single
+    plain   0     -      -     -
+    IOB     0     1      -     -
+    IOE     -     0      1     -
+    IOBES   0     1      2     3
 
 Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
 PER and LOC. To satisfy the above equations, the label map can be like this:
 
-    B-ORG  0
-    I-ORG  1
-    B-PER  2
-    I-PER  3
-    B-LOC  4
-    I-LOC  5
-    O      6
+   B-ORG  0
+   I-ORG  1
+   B-PER  2
+   I-PER  3
+   B-LOC  4
+   I-LOC  5
+   O      6
 
-It’s not hard to verify the equations noting that the num of chunk types
+It's not hard to verify the equations noting that the num of chunk types
 is 3 and the num of tag types in IOB scheme is 2. For example, the label
 id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
 I-LOC is 2, which consistent with the results from the equations.
diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc
index c87bded034e382c981d119e8499d6780e288031f..eae86a373be278cbb3ea9425b2ff0169f8faa99e 100644
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -54,10 +54,19 @@ be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
 shown in the following formula:
 
 $$
-Out = \frac{max\_norm * X}{norm(X)},
+Out = \\frac{max\\_norm * X}{norm(X)},
 $$
 
 where $norm(X)$ represents the L2 norm of $X$.
+
+Examples:
+        .. code-block:: python
+
+            data = fluid.layer.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.clip_by_norm(
+                x=data, max_norm=0.5)
+
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 3a4819f3dec9704a4a7c8910dd22e80fda082335..f40b1ba338d429c248103eeb930ac7e1bb690218 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -23,30 +23,26 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     OpComment comment;
-    AddInput("X",
-             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
-                             comment.type));
-    AddInput("Y", string::Sprintf(
-                      "(LoDTensor) the right hand operand of %s operator",
-                      comment.type));
+    AddInput("X", string::Sprintf("the left hand operand of %s operator",
+                                  comment.type));
+    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
+                                  comment.type));
     AddAttr<bool>("force_cpu",
-                  "(bool, default false) Force fill output variable to cpu "
+                  "Force fill output variable to cpu "
                   "memory. Otherwise, fill output variable to the running "
-                  "device")
-        .SetDefault(false);
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
+                  "device [default true].")
+        .SetDefault(true);
+    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
+                                     comment.equation));
+    AddComment(string::Sprintf(R"DOC(
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by %s
+calculated by $%s$
 )DOC",
-                               comment.type, comment.equation));
-    AddAttr<int>("axis",
-                 "(int, default -1). The start dimension index "
-                 "for broadcasting Y onto X.")
+                               comment.equation));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
   }
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 38337f9aa52435c445420047957500d21069506a..c72405593788493e10a1293b0c722e2d11c6e312 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
                       false> /* set false to disable empty grad */);
 REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_CPU_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
index 590eca9d066ff7549939e62ddbfedc8ab76bb5e7..8e38e5231fbf6955ff8a9680a241a4a4ba1b924d 100644
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -15,7 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
 REGISTER_OP_CUDA_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 1b1b8bf5ed959dd9c2ce8c9f5c905a75b81865fd..a496301526f58875ff51aeaa5b2094c3c656531c 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
 class ConcatGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
 
+    // get output tensor that the name is not kEmptyVarName
+    std::vector<framework::Tensor*> outputs;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs.push_back(outs[j]);
+      } else {
+        outputs.push_back(nullptr);
+      }
+    }
+
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
     if (axis == 0 && outs.size() < 10) {
       size_t input_offset = 0;
-      auto in_stride = framework::stride_numel(in->dims());
+      const auto in_stride = framework::stride_numel(out_grad->dims());
 
-      for (auto& out : outs) {
-        out->mutable_data<T>(ctx.GetPlace());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                    out_stride, in->data<T>() + input_offset,
-                                    in_stride, out_stride[axis]);
+      for (size_t i = 0; i < outs.size(); ++i) {
+        auto out_stride = framework::stride_numel(ins[i]->dims());
+        auto* out = outputs[i];
+        if (out != nullptr) {
+          StridedNumelCopyWithAxis<T>(
+              ctx.device_context(), axis, out->data<T>(), out_stride,
+              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
+        }
         input_offset += out_stride[axis];
       }
     } else {
-      std::vector<framework::Tensor> outputs(outs.size());
-      for (size_t j = 0; j < outs.size(); ++j) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs[j] = *outs[j];
-      }
-
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
+      concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
+                          &outputs);
     }
   }
 };
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index 0b363f5c43f9fc191790e5cca629ffc46eb9388c..2e9e957ebdc2a5cb7663b968c5da631aebe60b1c 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -156,7 +156,7 @@ Parameters(strides, paddings) are two elements. These two elements represent hei
 and width, respectively.
 The input(X) size and output(Out) size may be different.
 
-Example:
+For an example:
   Input:
        Input shape: $(N, C_{in}, H_{in}, W_{in})$
        Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 046dd11910bb0ff46b567c3b89883582782205d3..8f3644039f9950a8a70e2fd66c20837a5f52bd7f 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -76,9 +76,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
 
     AddComment(R"DOC(
-Cosine Similarity Operator.
+**Cosine Similarity Operator**
 
-$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$
+$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$
 
 The input X and Y must have the same shape, except that the 1st dimension
 of input Y could be just 1 (different from input X), which will be
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index 40f43936db662f2b18ffa540da4794755b5d6fc7..c27befe1143baa68add4b56f3572eab75272c3a5 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -53,21 +53,18 @@ sequence of observed tags.
 The output of this operator changes according to whether Input(Label) is given:
 
 1. Input(Label) is given:
-
-This happens in training. This operator is used to co-work with the chunk_eval
-operator.
-
-When Input(Label) is given, the crf_decoding operator returns a row vector
-with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an output is the
-input to chunk_eval operator.
+   This happens in training. This operator is used to co-work with the chunk_eval
+   operator.
+   When Input(Label) is given, the crf_decoding operator returns a row vector
+   with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
+   prediction, or 1 indicating a tag is correctly predicted. Such an output is the
+   input to chunk_eval operator.
 
 2. Input(Label) is not given:
-
-This is the standard decoding process.
+   This is the standard decoding process.
 
 The crf_decoding operator returns a row vector with shape [N x 1] whose values
-range from 0 to maximum tag number - 1. Each element indicates an index of a
+range from 0 to maximum tag number - 1, Each element indicates an index of a
 predicted tag.
 )DOC");
   }
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index 91cfbbda7352c9b1676aae99e2bd57ccc9e10069..772e80bbea4f2db654cefd0dcb404bc33803bd7a 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -52,7 +52,7 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
   } else {
     res = ctx.Attr<std::vector<int>>("offsets");
     PADDLE_ENFORCE_EQ(
-        rank, res.size(),
+        rank, static_cast<int>(res.size()),
         "Offsets size should be equal to dimension size of input tensor.");
   }
   return res;
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 92bb835e8f18e17ae1355fdec29f43b8ffb70460..5302b822d6b9f232e9ccd0d03cc549d7d5044ebf 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel {
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "Input of Cumsum operator");
-    AddOutput("Out", "Output of Cumsum operator");
+    AddInput("X", "Input of cumsum operator");
+    AddOutput("Out", "Output of cumsum operator");
     AddAttr<int>("axis",
-                 "(int, default -1). The dimenstion to accumulate along. "
-                 "-1 means the last dimenstion")
+                 "The dimenstion to accumulate along. -1 means the last "
+                 "dimenstion [default -1].")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     AddAttr<bool>("exclusive",
-                  "bool, default false). Whether to perform exclusive cumsum")
+                  "Whether to perform exclusive cumsum. [default false].")
         .SetDefault(false);
     AddAttr<bool>("reverse",
-                  "bool, default false). If true, the cumsum is performed in "
-                  "the reversed direction")
+                  "If true, the cumsum is performed in the reversed direction. "
+                  "[default false].")
         .SetDefault(false);
     AddComment(R"DOC(
 The cumulative sum of the elements along a given axis.
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 02ffe3651e1deefcf6981c3d304d64b9a01661bf..ea004f7cd340030e61571825941a50e89735ef05 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -245,7 +245,7 @@ void GRPCClient::Proceed() {
     if (c->status_.ok()) {
       c->Process();
     } else {
-      LOG(ERROR) << "var: " << c->var_h_.String()
+      LOG(FATAL) << "var: " << c->var_h_.String()
                  << " grpc error:" << c->status_.error_message();
     }
     delete c;
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
index 8c4b4321b7582a5cfad89f23e3d298ed16162d99..d0f95f727fdbc82777147e3e8ada6ad4f7a35e60 100644
--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
               "and M represents the number of deocded boxes.");
 
     AddComment(R"DOC(
-Bounding Box Coder Operator.
+
+Bounding Box Coder.
+
 Encode/Decode the target bounding box with the priorbox information.
+
 The Encoding schema described below:
-ox = (tx - px) / pw / pxv
-oy = (ty - py) / ph / pyv
-ow = log(abs(tw / pw)) / pwv 
-oh = log(abs(th / ph)) / phv 
+
+    ox = (tx - px) / pw / pxv
+
+    oy = (ty - py) / ph / pyv
+
+    ow = log(abs(tw / pw)) / pwv 
+
+    oh = log(abs(th / ph)) / phv 
+
 The Decoding schema described below:
-ox = (pw * pxv * tx * + px) - tw / 2
-oy = (ph * pyv * ty * + py) - th / 2
-ow = exp(pwv * tw) * pw + tw / 2
-oh = exp(phv * th) * ph + th / 2
-where tx, ty, tw, th denote the target box's center coordinates, width and
-height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
-center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
-of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
-width and height.
+
+    ox = (pw * pxv * tx * + px) - tw / 2
+
+    oy = (ph * pyv * ty * + py) - th / 2
+
+    ow = exp(pwv * tw) * pw + tw / 2
+
+    oh = exp(phv * th) * ph + th / 2
+
+where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
+and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
+priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
+`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
+encoded/decoded coordinates, width and height.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc
index 8e58605fcea04f9ffa97ce8cca53c073e7068aaf..9c89b7ca9af1b235659554afc805600d31ef8ea6 100644
--- a/paddle/fluid/operators/detection/iou_similarity_op.cc
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -68,15 +68,16 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
               "representing pairwise iou scores.");
 
     AddComment(R"DOC(
-IOU Similarity Operator.
+**IOU Similarity Operator**
+
 Computes intersection-over-union (IOU) between two box lists.
- Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
- boxes in 'Y' are shared by all instance of the batched inputs of X.
- Given two boxes A and B, the calculation of IOU is as follows:
+Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+boxes in 'Y' are shared by all instance of the batched inputs of X.
+Given two boxes A and B, the calculation of IOU is as follows:
 
 $$
 IOU(A, B) = 
-\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
+\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)}
 $$
 
 )DOC");
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
index 335e8dd470f851d8c5f6bdbc94cfc343da269034..568d50d457d838d5f11605710c0d3b987af01d10 100644
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -83,11 +83,13 @@ class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddComment(R"DOC(
 PolygonBoxTransform Operator.
+
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+
 The input is the final geometry output in detection network.
 We use 2*n numbers to denote the coordinate shift from n corner vertices of
 the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
 the geometry output contains 2*n channels.
-PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index ba343909bb87b4f2efa56c0a4ff664b278e90c60..7cd67e74de6b9c4fbc718f60b4f671ccab2f9956 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y");
+REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
index 8050f61d4546f3351645f23ddcc63b2c49f17929..4a974281481c8bc02589b428098475d73b8a0ba5 100644
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
   void Apply() override {
     AddAttr<float>("mean",
                    "(float, default 0.0) "
-                   "mean of random tensor.")
+                   "The mean (or center) of the gaussian distribution.")
         .SetDefault(.0f);
     AddAttr<float>("std",
                    "(float, default 1.0) "
-                   "std of random tensor.")
+                   "The standard deviation (std, or spread) of the "
+                   "gaussian distribution.")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
                  "(int, default 0) "
@@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
         .SetDefault(framework::proto::VarType::FP32);
 
     AddComment(R"DOC(
-GaussianRandom Operator.
 
 Used to initialize tensors with gaussian random generator.
+The defalut mean of the distribution is 0. and defalut standard
+deviation (std) of the distribution is 1.. Uers can set mean and std
+by input arguments.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index eafc364a15fa17cc5107bba737b0b44e712b0bef..db6ff7825690176ded0ab957764ed8411d3cd804 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         .InEnum({"CUDA", "CPU", "AUTO"})
         .SetDefault("AUTO");
     AddComment(R"DOC(
-Returns a list of places based on flags. The list will be used for parallel
+Returns a list of places based on arguments. The list will be used for parallel
 execution.
 )DOC");
   }
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index ab097d31e9ab5eafa788539170e7e405df697625..14ce1da2e97186a50ed8bd52223a500c4c57b328 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel {
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("X", "The input tensor.");
     AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "(optional) Scale is a 1-dimensional tensor of size "
              "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
              "It is applied to the output.")
         .AsDispensable();
     AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "(optional) Bias is a 1-dimensional tensor of size "
              "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
              "It is applied to the output.")
         .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
-        .AsIntermediate();
-    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
+    AddOutput("Variance", "Variance of the current mini batch.")
         .AsIntermediate();
 
     AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
-                   "numerical stability")
+                   "Constant for numerical stability [default 1e-5].")
         .SetDefault(1e-5)
         .AddCustomChecker([](const float &epsilon) {
           PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                          "'epsilon' should be between 0.0 and 0.001.");
         });
     AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
-                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
                  "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
+                 "matrix [N,H]. [default 1].")
         .SetDefault(1)
         .AddCustomChecker([](const int &begin_norm_axis) {
           PADDLE_ENFORCE_GT(begin_norm_axis, 0,
@@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
         });
 
     AddComment(R"DOC(
-Layer Normalization.
-Layer Norm has been implemented as discussed in the paper:
-https://arxiv.org/abs/1607.06450
-...
+Assume feature vectors exist on dimensions
+:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+along these dimensions for each feature vector :math:`a` with size
+:math:`H`, then normalize each feature vector using the corresponding
+statistics. After that, apply learnable gain and bias on the normalized
+tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index a711da362771353891f900f544d97e64510dc0ba..ea1ca7f59db22bee973a8827a88e2fb80265fa51 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -84,6 +84,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
 http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
 
 Equation:
+
 1. Denote Input(Emission) to this operator as $x$ here.
 2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as $a$ here.
@@ -106,6 +107,7 @@ Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.
 
 NOTE:
+
 1. The feature function for a CRF is made up of the emission features and the
 transition features. The emission feature weights are NOT computed in
 this operator. They MUST be computed first before this operator is called.
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 4d12278799f66f2fb92b7580ba0c43e845aa4d3a..57c2ce457791d830e4230aa25e1c5b358f476782 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 };
 
 void SignalHandler::StopAndExit(int signal_num) {
-  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  // Do not use VLOG here for the device for printing maybe already released.
+  // exit will release interal allocated resoureces.
   exit(0);
 }
 
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index 4751e3e8025e51a687f8fcfd25e603b61e762f6d..3225bf9bb63d57969ce9ae0e4a74e8f466c8c2d0 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -184,34 +184,32 @@ Long-Short Term Memory (LSTM) Operator.
 The defalut implementation is diagonal/peephole connection
 (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
 
-$$
-i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\
+$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$
 
-f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\
+$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$
 
-\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\
+$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$
 
-o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\
+$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$
 
-c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
 
-h_t = o_t \odot act_h(c_t)
-$$
+$$ h_t = o_t \\odot act_h(c_t) $$
 
-where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-are diagonal weight matrices for peephole connections. In our implementation,
-we use vectors to reprenset these diagonal weight matrices. The b terms
-denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
-is the non-line activations, such as logistic sigmoid function, and
-$i, f, o$ and $c$ are the input gate, forget gate, output gate,
-and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector $h$.
-
-The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
-are the cell input and cell output activation functions and `tanh` is usually
-used for them. $\tilde{c_t}$ is also called candidate hidden state,
-which is computed based on the current input and the previous hidden state.
+- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+  of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+  are diagonal weight matrices for peephole connections. In our implementation,
+  we use vectors to reprenset these diagonal weight matrices.
+- The b terms denote bias vectors ($b_i$ is the input gate bias vector).
+- $\sigma$ is the non-line activations, such as logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- The $\odot$ is the element-wise product of the vectors.
+- $act_g$ and $act_h$ are the cell input and cell output activation functions
+  and `tanh` is usually used for them.
+- $\tilde{c_t}$ is also called candidate hidden state,
+  which is computed based on the current input and the previous hidden state.
 
 Set `use_peepholes` False to disable peephole connection. The formula
 is omitted here, please refer to the paper
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index cc69212466b72f3fa82e8f5f58b4f3229dab28ec..14964fc62af6dd947e49a2054511780a1bb20cf2 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -70,35 +70,40 @@ template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs->size();
+    size_t num = outputs->size();
 
     int input_rows = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
+
     int input_cols = 0;
 
     std::vector<int64_t> output_cols(outputs->size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = outputs->at(i).numel() / input_rows;
+    for (size_t i = 0; i < num; ++i) {
+      int t_cols = ref_inputs[i]->numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
     auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
-    for (int k = 0; k < input_rows; ++k) {
+    for (size_t k = 0; k < input_rows; ++k) {
       const T* src_ptr = input.data<T>() + k * input_cols;
       int col_idx = 0;
       for (int j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                     sizeof(T) * col_len);
+        auto* out_tensor = outputs->at(j);
+        if (out_tensor != nullptr) {
+          T* dst_ptr = out_tensor->data<T>() + k * col_len;
+          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
+                       sizeof(T) * col_len);
+        }
         col_idx += col_len;
       }
     }
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 4285d38dcd6a4124543cdd2246c82a8203f5a281..f66baa6573f02a7b5ebc2c1a5dd20333ef42b05b 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
     T* output_ptr = outputs_data[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * segment_width + local_col] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }
 
@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
     int split = tid_x / fixed_out_col;
     int in_offset = tid_x - split * fixed_out_col;
     T* output_ptr = outputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * fixed_out_col + in_offset] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }
 
@@ -203,17 +207,18 @@ template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
     int out_row = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       out_row *= dim_0[i];
     }
 
-    int out_col = outputs->at(0).numel() / out_row;
+    int out0_col = ref_inputs[0]->numel() / out_row;
     int in_col = 0, in_row = out_row;
     bool sameShape = true;
 
@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
     outputs_cols[0] = 0;
     for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i).numel() / out_row;
+      int t_col = outputs->at(i)->numel() / out_row;
       if (sameShape) {
-        if (t_col != out_col) sameShape = false;
+        if (t_col != out0_col) sameShape = false;
       }
       in_col += t_col;
       outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs->at(i).data<T>();
+      if (outputs->at(i) != nullptr) {
+        outputs_ptr[i] = outputs->at(i)->data<T>();
+      } else {
+        outputs_ptr[i] = nullptr;
+      }
     }
 
     T** dev_out_gpu_data =
@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
       const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h
index 041ce8bf8a2e9528a004c076ead4471a3837c1a6..9e080f2e8be23768dcea47b577043beef37b2eaf 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>* outputs);
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 4881cff4a368ffae9b030f04b7fff01d6ee7d26e..9e0bebd17c02a3ce010b77142757b8789cfbcdd9 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").Reuse("X");
+    AddInput("X", "(Tensor) The input of mean op");
+    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
     AddComment(R"DOC(
-Mean Operator.
-
-Out is a scalar which is the mean of all elements in X. 
+Mean Operator calculates the mean of all elements in X.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index a4363fd25d57edb5c2509904a1f55634832613be..18ad46cb5eeeab2169136e40cebdaa53c0bfd587 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel {
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("Ids", "The index tensor of multiplex operator.");
-    AddInput("X", "The candidate tensors of multiplex operator.")
+    AddInput("Ids",
+             "Tensor<int32>, index variable which is a 2-D tensor with shape "
+             "[M, 1] where M is the batch size.");
+    AddInput("X",
+             "A list of variables to gather from. All variables have the same "
+             "shape and the rank is at least 2.")
         .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
     AddComment(R"DOC(
-Multiplex Operator.
-
-Multiplex multiple tensors according to the index provided by the index tensor.
-
-Ids: the index tensor.
-X[0 : N - 1]: the candidate tensors for output (N >= 2).
-For each index i from 0 to batchSize - 1, the output is the i-th row of the
+Referring to the given index variable, this layer selects rows from the
+input variables to construct a multiplex variable. Assuming that there are
+:math:`m` input variables and :math:`I_i` represents the i-th input
+variable and :math:`i` is in [0, :math:`m`). All input variables are
+tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
+Please note that rank of the input tensor should be at least 2. Each input
+variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
+where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
+* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
+variable. The given index variable should be a 2-D tensor with shape
+[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
+Then the output variable will be a tensor with shape [:math:`d_0`,
+:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
+matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
+row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+
+* Ids: the index tensor.
+
+* X[0 : N - 1]: the candidate tensors for output (N >= 2).
+
+* For each index i from 0 to batchSize - 1, the output is the i-th row of the
 the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-$$y[i] = x_{k}[i]$$
+$$
+y[i] = x_{k}[i]
+$$
 
-where `y` is the output tensor, `x_{k}` is the k-th input tensor,
-and `k = Ids[i]`.
+where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
+and $k = Ids[i]$.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 06092e680a1efbef379ccf40fdf476769f820429..e471f04662a1fa3e8e77a2db37f0da4521682018 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -128,8 +128,10 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
                               "user should avoid setting this attribute.")
         .SetDefault({});
     AddComment(R"DOC(
-Compute and return the noise-contrastive estimation training loss.
-See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+Compute and return the noise-contrastive estimation training loss. See 
+`Noise-contrastive estimation: A new estimation principle for unnormalized 
+statistical models 
+ <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
 By default this operator uses a uniform distribution for sampling.
 )DOC");
   }
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index 6707cdded4020fe3e2b01ba399dfc279a9da677d..f8ad63690e84339da0390d4ddd2db45f25db385a 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -204,8 +204,6 @@ void Pool2dOpMaker::Make() {
   // TODO(dzhwinter): need to registered layout transform function
 
   AddComment(R"DOC(
-Pool2d Operator.
-
 The pooling2d operation calculates the output based on
 the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
@@ -215,19 +213,28 @@ These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
+
   Input:
+
        X shape: $(N, C, H_{in}, W_{in})$
+
   Output:
+
        Out shape: $(N, C, H_{out}, W_{out})$
+
   For ceil_mode = false:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
   For ceil_mode = true:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
        $$
 
 )DOC");
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index 282ec3f36b98e7aa62d71fb04f72721a5464e21c..559827f08494af6730aafa1e67c46a47c21dedf6 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
  protected:
   void Apply() override {
-    AddAttr<std::string>("filename", "The filename of record io reader");
+    AddAttr<std::string>(
+        "filename",
+        "The filename of record file. This file will given to reader.");
     AddComment(R"DOC(
-      CreateRecordIOReader Operator
+Open a recordio file and return the reader object. The returned reader object
+is thread-safe.
 
-      Create a reader from a record io file
+NOTE: This is a very low-level API. It is used for debugging data file or
+training. Please use `open_files` instead of this API for production usage.
     )DOC");
   }
 };
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 612e1f5eca3a4836db1fd167fc6bb63400d20177..e11256a49ffa6adc9410376cc8a71fa017df7e9c 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -54,7 +54,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
 }
 
 void FileReaderMakerBase::Make() {
-  AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
+  AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
   AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
   AddAttr<std::vector<int>>(
       "ranks",
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 293abb0ea4f1ac03c3889ce2937ef8fa0845db73..d6d209d5de041500a9b4893d70800a58e8ee1e1d 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -139,7 +139,20 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The pooled output width.")
         .SetDefault(1);
     AddComment(R"DOC(
-ROIPool operator
+**ROIPool Operator**
+
+Region of interest pooling (also known as RoI pooling) is to perform
+is to perform max pooling on inputs of nonuniform sizes to obtain
+fixed-size feature maps (e.g. 7*7).
+
+The operator has three steps:
+
+1. Dividing each region proposal into equal-sized sections with
+   the pooled_width and pooled_height
+
+2. Finding the largest value in each section
+
+3. Copying these max values to the output buffer
 
 ROI Pooling for Faster-RCNN. The link below is a further introduction: 
 https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 20f140f962c3aac364a1239a663d5f340bbeb6b2..10b1b0c899d833d70fa6afe51998fe210899e3c3 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X",
-             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "the input(X) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
              "in this LoDTensor is a matrix with shape (T x N), where T "
              "is the total time steps in this mini-batch and N is the input "
              "data dimension.");
     AddInput("Filter",
-             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "the input(Filter) is a learnable parameter. It "
              "is a 2-D tensor with shape (future_context x N), where, "
              "future_context is the future context length and N is the data "
              "dimension.");
     AddOutput("Out",
-              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "the output(Out) is a LodTensor, which supports "
               "variable time-length input sequences. The underlying tensor "
               "in this LodTensor is a matrix with shape T x N, i.e., the "
               "same shape as X.");
     AddComment(R"DOC(
-Row-convolution Operator.
+:strong:`Row-convolution operator`
 
 The row convolution is called lookahead convolution.  This operator was 
 introduced in the following paper for DeepSpeech2:
@@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$,
 the output sequence is convolved as:
 
 $$
-out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
+out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
 $$
 
+In the above equation:
+
+* $Out_{i}$: The i-th row of output variable with shape [1, D].
+
+* $\\tau$: Future context size.
+
+* $X_{j}$: The j-th row of input variable with shape [1, D].
+
+* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+
+More details about row_conv please refer to
+the design document
+https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
+
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 4687e21e7155fc7309fb28c881c0d47152df9ad5..7f8822e40053b5bcd394f446138a2292d80b69bf 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -41,13 +41,13 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X", "(Tensor) Input tensor of scale operator.");
     AddOutput("Out", "(Tensor) Output tensor of scale operator.");
     AddComment(R"DOC(
-Scale operator
+**Scale operator**
+
+Multiply the input tensor with a float scalar to scale the input tensor.
 
 $$Out = scale*X$$
 )DOC");
-    AddAttr<float>("scale",
-                   "(float, default 1.0)"
-                   "The scaling factor of the scale operator.")
+    AddAttr<float>("scale", "The scaling factor of the scale operator.")
         .SetDefault(1.0);
   }
 };
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
index c75fce7959d1af51afd52af23fe657d10a2f3988..b44d5f898013a5d27467bd80118c29a886d5e8b3 100644
--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -36,10 +36,13 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Input", "(Tensor), The input tensor.");
-    AddOutput("Out", "(Tensor), The shape of input tensor.");
+    AddOutput("Out",
+              "(Tensor), The shape of input tensor, the data type of the shape"
+              " is int64_t, will be on the same device with the input Tensor.");
     AddComment(R"DOC(
-Shape Operator. 
-Get the shape of input tensor.
+Shape Operator
+
+Get the shape of input tensor. Only support CPU input Tensor now.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 135e2a6f7f877c9ef159a4542b834d5627649e81..c3b0fe32098cb4b41ccc155db58809ef9f1bf46b 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -113,14 +113,14 @@ The logistic loss is given as follows:
 
        $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
 
-We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:
+We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get:
 
        $$loss = X - X * Labels + \log(1 + \exp(-X))$$
 
 For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
 we reformulate the loss as follows:
 
-       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
+       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$
 
 Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
 However the output only shares the LoD with input `X`.
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
index 61bb445e8b4c6a71e9b1a6a0bcf02a31ab271d0a..4bd23d594134f227e86b01fd75b7e202dd76c11b 100644
--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -95,23 +95,26 @@ of that dimension. If the value passed to start or end is larger than
 the n (the number of elements in this dimension), it represents n. 
 For slicing to the end of a dimension with unknown size, it is recommended 
 to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1].
-
-    Example 1:
-    Given:
-        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-        axes = [0, 1]
-        starts = [1, 0]
-        ends = [2, 3]
-    Then:
-        result = [ [5, 6, 7], ]
-
-    Example 2:
-    Given:
-        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-        starts = [0, 1]
-        ends = [-1, 1000]
-    Then:
-        result = [ [2, 3, 4], ]
+Following examples will explain how slice works:
+
+    .. code-block:: text
+
+        Cast1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Cast2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                starts = [0, 1]
+                ends = [-1, 1000]
+            Then:
+                result = [ [2, 3, 4], ]
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index 5e2b2a994534c2fb1e053c067b36651d358b9da8..d661b276bc31bf0c3ab181d706ffdccec89f0632 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -115,4 +115,7 @@ USE_CPU_ONLY_OP(concat);
 
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
 REGISTER_OP_CPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc
index efa378af857a8881f25c76379ba7cf81e64c80bb..18e0904681753aff7f3deac96efb6d62f389a031 100644
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
@@ -15,4 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
+    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index 4b1208c4376b48e25866fc510f3a6d2ea06e7610..0ea273af9d5a5c8f1ae112232a9187675031b360 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 }  // namespace
 
 template <typename DeviceContext, typename T>
-void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+void TensorRTEngineKernel<DeviceContext, T>::Prepare(
     const framework::ExecutionContext &context) const {
   VLOG(4) << "Prepare engine";
   // Get the ProgramDesc and pass to convert.
   framework::proto::BlockDesc block_desc;
   block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-  max_batch_ = context.Attr<int>("max_batch");
+  int max_batch = context.Attr<int>("max_batch");
   auto max_workspace = context.Attr<int>("max_workspace");
-  engine_ = Singleton<TRT_EngineManager>::Global().Create(
-      max_batch_, max_workspace, &stream_);
-  engine_->InitNetwork();
+  auto params = context.Attr<std::vector<std::string>>("parameters");
+  std::unordered_set<std::string> parameters;
+  for (const auto &param : params) {
+    parameters.insert(param);
+  }
+
+  // TODO(Superjomn) replace this with a different stream
+  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+      context.Attr<std::string>("engine_uniq_key"));
+  engine->InitNetwork();
 
   framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
   // Add inputs
@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
     PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                       "TensorRT engine only takes LoDTensor as input");
     auto shape = var->GetShape();
-    engine_->DeclareInput(
+    engine->DeclareInput(
         input, FluidDataType2TRT(
                    var->Proto()->type().lod_tensor().tensor().data_type()),
         Vec2TRT_Dims(var->GetShape()));
   }
 
-  // TODO(Superjomn) parameters should be passed after analysised from outside.
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block_desc, {}, context.scope(), engine_);
+      block_desc, parameters, context.scope(), engine);
 
   // Add outputs
   VLOG(4) << "declare outputs";
   for (auto &output : context.Outputs("Ys")) {
     VLOG(4) << "declare output " << output;
-    engine_->DeclareOutput(output);
+    engine->DeclareOutput(output);
   }
 
-  engine_->FreezeNetwork();
+  engine->FreezeNetwork();
 }
 
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Xs", "A list of inputs.").AsDuplicable();
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
     AddAttr<int>("max_batch", "the maximum batch size.");
     AddAttr<int>("max_workspace", "the maximum batch size.");
     AddComment("TensorRT engine operator.");
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 4b089601ff76eedd87bb3a52a38c4d22d4a94bf6..8455d24ddf47382b235edda10cb9b2e8934c5f06 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -19,10 +19,14 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
 
 namespace paddle {
 namespace operators {
 
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
 class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -47,16 +51,18 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    if (!engine_) {
+    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
     }
+    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
     // Try to determine a batch_size
     auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
         context.scope(), input_names.front());
     int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
 
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
@@ -64,20 +70,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
           context.scope(), x);
       if (platform::is_cpu_place(t.place())) {
-        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
       } else {
-        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
       }
     }
     // Execute the engine.
     PADDLE_ENFORCE_GT(batch_size, 0);
-    engine_->Execute(batch_size);
+    engine->Execute(batch_size);
     // Convert output tensor from engine to fluid
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
       if (platform::is_cpu_place(fluid_t->place())) {
         // TODO(Superjomn) change this float to dtype size.
-        engine_->GetOutputInCPU(
+        engine->GetOutputInCPU(
             y, fluid_t->mutable_data<float>(platform::CPUPlace()),
             size * sizeof(float));
       } else {
-        engine_->GetOutputInGPU(
+        engine->GetOutputInGPU(
             y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
             size * sizeof(float));
       }
     }
 
-    cudaStreamSynchronize(stream_);
+    cudaStreamSynchronize(*engine->stream());
   }
 
  protected:
   // Build the engine.
   void Prepare(const framework::ExecutionContext& context) const;
-
- private:
-  mutable cudaStream_t stream_;
-  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
-  mutable int max_batch_{0};
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
index 6f383de259b270038c32296b59007f6c7d895f12..3a2fef48052ae3943abad14bf87c14ca79251c94 100644
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
   attr->set_type(paddle::framework::proto::AttrType::LONG);
   attr->set_l(data);
 }
+template <>
+void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
+                                       const std::string& name,
+                                       const std::vector<std::string>& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
+  for (const auto& s : data) {
+    attr->add_strings(s.c_str());
+  }
+}
 
 }  // namespace
 
@@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
   SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
+                                    std::vector<std::string>({}));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+  LOG(INFO) << "engine_op " << engine_op.get();
 
   framework::Scope scope;
   platform::CPUPlace place;
@@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
   engine_op->Run(scope, place);
 }
 
+void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  using shape_t = std::vector<int64_t>;
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+
+  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
+                        const std::string& z_name, bool x_created,
+                        const shape_t& x_shape, const shape_t& y_shape,
+                        const shape_t& z_shape) {
+
+    LOG(INFO) << "create fc op";
+    auto* fc = block_desc.AppendOp();
+    fc->SetType("mul");
+    fc->SetInput("X", std::vector<std::string>({x_name}));
+    fc->SetInput("Y", std::vector<std::string>({y_name}));
+    fc->SetOutput("Out", std::vector<std::string>({z_name}));
+
+    // Set inputs' variable shape in BlockDesc
+    if (!x_created) {
+      AddTensorToBlockDesc(block_, x_name,
+                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
+    }
+    AddTensorToBlockDesc(block_, y_name,
+                         std::vector<int64_t>({input_dim, output_dim}));
+    AddTensorToBlockDesc(block_, z_name,
+                         std::vector<int64_t>({batch_size, output_dim}));
+
+    // Prepare variables.
+    if (!x_created) {
+      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
+    }
+    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
+    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
+
+    // It is wired, need to copy manually.
+    *block_->add_ops() = *fc->Proto();
+  };
+
+  // Test with 4 layer FC
+  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
+             {input_dim, output_dim}, {batch_size, output_dim});
+  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
+
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<std::vector<std::string>>(
+      engine_op_desc.Proto(), "parameters",
+      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
+
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  // Execute them.
+  engine_op->Run(scope, place);
+}
+
+// Test with a larger FC layer.
+TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
index 78fee77df8151221459b0afa0d6789bfe82cfda5..75d6181749e4e9bd81a3c02de69caf0acd81eef9 100644
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -35,10 +35,10 @@ class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
  protected:
   void Apply() override {
     AddComment(R"DOC(
-Uniform random operator
+UniformRandomBatchSizeLike operator.
 
 This operator initializes a tensor with the same batch_size as the Input tensor
- with random values sampled from a uniform distribution.
+with random values sampled from a uniform distribution.
 
 )DOC");
     AddAttr<float>("min",
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 137ea91caedabc3167146d91b063dbe9e2e2b931..edd1baa4ace4e246190afcd12b0716f1dd38e243 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -86,32 +86,24 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
+    AddOutput("Out", "The output tensor of uniform random op");
     AddComment(R"DOC(
-Uniform random operator.
-
 This operator initializes a tensor with random values sampled from a
-uniform distribution.
+uniform distribution. The random result is in set [min, max].
 
 )DOC");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) The shape of the output tensor");
-    AddAttr<float>("min",
-                   "(float, default -1.0) "
-                   "Minimum value of uniform random")
+    AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
+    AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
         .SetDefault(-1.0f);
-    AddAttr<float>("max",
-                   "(float, default 1.0) "
-                   "Maximun value of uniform random")
+    AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
-                 "(int, default 0) "
                  "Random seed used for generating samples. "
                  "0 means use a seed generated by the system."
                  "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
+                 "generate the same random numbers every time. [default 0].")
         .SetDefault(0);
-    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
+    AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
         .SetDefault(framework::proto::VarType::FP32);
   }
 };
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 40dc7c9a0b6a40f2419ace3ce7e0e5e82bc95c1a..f832d72b53e8d06a32d5c0ac2ecf7130aa28a666 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -28,9 +28,15 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
 
-DEFINE_uint64(
-    initial_cpu_memory_in_mb, 500,
-    "Default initial 500MB of CPU memory for PaddlePaddle, in MD unit.");
+DEFINE_uint64(initial_cpu_memory_in_mb,
+#ifdef PADDLE_WITH_MKLDNN
+              /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
+               * to obtain the best performance*/
+              5000,
+#else
+              500,
+#endif
+              "Initial CPU memory for PaddlePaddle, in MD unit.");
 
 DEFINE_double(
     fraction_of_cuda_pinned_memory_to_use, 0.5,
@@ -59,10 +65,7 @@ inline size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return std::min(
-      static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
-                          CpuTotalPhysicalMemory()),
-      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
 size_t CpuMinChunkSize() {
@@ -71,8 +74,11 @@ size_t CpuMinChunkSize() {
 }
 
 size_t CpuMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
-  return CpuMaxAllocSize() / 32;
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
+  // or the initial_cpu_memory_in_mb.
+  return std::min(
+      static_cast<size_t>(CpuMaxAllocSize() / 32),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }
 
 size_t CUDAPinnedMaxAllocSize() {
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index bd5c613f8cf794df5dfeb7517ed4350f9b3b6099..74036bcb3114df8fc4613bd9f4dc327463397dba 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -144,28 +144,74 @@ PYBIND11_PLUGIN(core) {
   py::class_<LoDTensor, Tensor>(m, "LoDTensor")
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
-      .def(
-          "__init__",
-          [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-            LoD new_lod;
-            new_lod.reserve(lod.size());
-            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-            new (&instance) LoDTensor(new_lod);
-          })
+      .def("__init__",
+           [](LoDTensor &instance, const std::vector<std::vector<size_t>>
+                                       &recursive_sequence_lengths) {
+             LoD new_lod;
+             new_lod.reserve(recursive_sequence_lengths.size());
+             std::copy(recursive_sequence_lengths.begin(),
+                       recursive_sequence_lengths.end(),
+                       std::back_inserter(new_lod));
+             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+             PADDLE_ENFORCE(
+                 CheckLoD(new_offset_lod, -1),
+                 "the provided recursive_sequence_lengths info is invalid");
+             new (&instance) LoDTensor(new_offset_lod);
+           })
       .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
+             // the input lod is offset-based level-of-detail info
+             LOG(WARNING)
+                 << "set_lod is deprecated and will be removed by 9.2018, "
+                    "please switch to set_recursive_sequence_lengths.";
              LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()),
+                            "the provided lod info is invalid");
              self.set_lod(new_lod);
            })
-      .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-        auto lod = self.lod();
-        std::vector<std::vector<size_t>> new_lod;
-        new_lod.reserve(lod.size());
-        std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-        return new_lod;
+      .def("set_recursive_sequence_lengths",
+           [](LoDTensor &self, const std::vector<std::vector<size_t>>
+                                   &recursive_sequence_lengths) {
+             // the input recursive_sequence_lengths is length-based
+             // level-of-detail info
+             LoD new_lod;
+             new_lod.reserve(recursive_sequence_lengths.size());
+             std::copy(recursive_sequence_lengths.begin(),
+                       recursive_sequence_lengths.end(),
+                       std::back_inserter(new_lod));
+             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+             PADDLE_ENFORCE(
+                 CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
+                 "the provided recursive_sequence_lengths info is invalid");
+             self.set_lod(new_offset_lod);
+           })
+      .def("lod",
+           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+             // output the offset-based lod info
+             LOG(WARNING) << "lod is deprecated and will be removed by 9.2018, "
+                             "please switch to recursive_sequence_lengths.";
+             LoD lod = self.lod();
+             std::vector<std::vector<size_t>> new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return new_lod;
+           })
+      .def("recursive_sequence_lengths",
+           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+             // output the length-based lod info
+             LoD lod = ConvertToLengthBasedLoD(self.lod());
+             std::vector<std::vector<size_t>> new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return new_lod;
+           })
+      .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool {
+        // Check that the lod info is valid and match the outermost
+        // dimension of the LoDTensor data
+        return CheckLoD(self.lod(), vectorize(self.dims()).front());
       });
 
   py::class_<SelectedRows>(m, "SelectedRows")
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 507479c8622c8d33722e08bba018ad1ba5452e15..555be3d00e2dc467eec45210cc997779827ed69f 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -30,7 +30,9 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn"));
+  new_argv.push_back(strdup(
+      "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index bd985ad733aa8eece2f8374d033f452a0175a011..5af5bc9c4731317075b3912a4749a0b358bdd56e 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -117,7 +117,7 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn'
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index e2013137b14f73bb0fcfb57b4bdc35fcc043bdc0..ac396002018d5952bee4aa79ff4aaa5463e2e9e1 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -47,7 +47,7 @@ class DataToLoDTensorConverter(object):
         self.lod = []
 
         for i in six.range(lod_level):
-            self.lod.append([0])
+            self.lod.append([])
 
     def feed(self, data):
         self._feed_impl_(data, self.lod, self.lod_level)
@@ -56,8 +56,7 @@ class DataToLoDTensorConverter(object):
         if lod_level == 0:
             self.data.append(data)
         else:
-            cur_lod_len = len(data)
-            lod[0].append(lod[0][-1] + cur_lod_len)
+            lod[0].append(len(data))
             for each_data in data:
                 self._feed_impl_(each_data, lod[1:], lod_level - 1)
 
@@ -66,7 +65,7 @@ class DataToLoDTensorConverter(object):
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
-            t.set_lod(self.lod)
+            t.set_recursive_sequence_lengths(self.lod)
         return t
 
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index f6438c82ac207d0e38d8be5e9d6252b28e72826e..42d3c9c153de2671f67bcb6d8f14e677413011ab 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -644,7 +644,13 @@ class Operator(object):
 
     def set_attr(self, name, val):
         self.attrs[name] = val
-        self.desc.set_attr(name, val)
+        if isinstance(val, Block):
+            self.desc.set_block_attr(name, val.desc)
+        elif isinstance(val, core.BlockDesc) or \
+                isinstance(val, core.ProgramDesc):
+            self.desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            self.desc.set_attr(name, val)
 
     @property
     def attr_names(self):
@@ -1034,6 +1040,37 @@ class Block(object):
 
 
 class Program(object):
+    """
+    Python Program. Beneath it is a ProgramDesc, which is used for
+    create c++ Program. A program is a self-contained programing
+    language like container. It has at least one Block, when the
+    control flow op like conditional_block, while_op is included,
+    it will contains nested block.
+    Please reference the framework.proto for details.
+
+    Notes: we have default_startup_program and default_main_program
+    by default, a pair of them will shared the parameters.
+    The default_startup_program only run once to initialize parameters,
+    default_main_program run in every minibatch and adjust the weights.
+
+    Args:
+        None
+
+    Returns:
+        Python Program
+
+    Examples:
+       .. code-block:: python
+
+         main_program = Program()
+         startup_program = Program()
+         with fluid.program_guard(main_program=main_program, startup_program=startup_program):
+            fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
+            fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
+            fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
+
+    """
+
     def __init__(self):
         self.desc = core.ProgramDesc()
         self.blocks = [Block(self, 0)]
@@ -1099,6 +1136,8 @@ class Program(object):
 
     def clone(self, for_test=False):
         """Clone the Program object
+        Args:
+           for_test(bool): indicate whether clone for test.
 
         Set for_test to False when we want to clone the program for training.
         Set for_test to True when we want to clone the program for testing.
@@ -1109,8 +1148,9 @@ class Program(object):
                 the is_test attributes in these operators will be set to True for
                 testing purposes, otherwise, they remain unchanged.
 
-        Returns(Program):
-            The cloned Program object.
+        Returns:
+            Program: The cloned Program object.
+
         """
         if for_test:
             p = self.inference_optimize()
@@ -1228,6 +1268,7 @@ class Program(object):
     def copy_param_info_from(self, other):
         """
         Copy the information of parameters from other program.
+
         Args:
             other(Program): Other program
 
@@ -1246,6 +1287,7 @@ class Program(object):
     def copy_data_info_from(self, other):
         """
         Copy the information of data variables from other program.
+
         Args:
             other(Program): Other program
 
@@ -1299,6 +1341,7 @@ class Parameter(Variable):
     def to_string(self, throw_on_error, with_details=False):
         """
         To debug string.
+
         Args:
             throw_on_error(bool): raise exception when self is not initialized
                 when throw_on_error is True
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 4e132ed26183eaa5e572128e679cdbffd42e5a42..c36ad324e70ccf0c7ca40c6921fcc650e97e8b87 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -15,11 +15,13 @@
 import framework
 import numpy as np
 import contextlib
+from framework import convert_np_dtype_to_dtype_
+from core import VarDesc
 
 __all__ = [
-    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'force_init_on_cpu',
     'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
-    'NormalInitializer', 'XavierInitializer'
+    'NormalInitializer', 'XavierInitializer', 'BilinearInitializer'
 ]
 
 _force_init_on_cpu_ = False
@@ -422,6 +424,101 @@ class MSRAInitializer(Initializer):
         return op
 
 
+class BilinearInitializer(Initializer):
+    """Implements the bilinear initializer.
+
+    This initializer can be used in transposed convolution operator to
+    act as upsampling. Users can upsample a feature map with shape of
+    (B, C, H, W) by any integer factor. The usage is:
+  
+    >>>  factor = 2
+    >>>  w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
+    >>>                     initializer=Bilinear())
+    >>>  conv_up = fluid.layers.conv2d_transpose(
+    >>>      input,
+    >>>      num_filters=C,
+    >>>      output_size=None,
+    >>>      filter_size=2 * factor - factor % 2,
+    >>>      padding=ceil((factor - 1) / 2.),
+    >>>      stride=factor,
+    >>>      groups=C,
+    >>>      param_attr=w_attr,
+    >>>      bias_attr=False)
+
+
+    Where, `num_filters=C` and `groups=C` means this is channel-wise tranposed
+    convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
+    This initializer will set a (K, K) interpolation kernel for every channel
+    of the filter identically. The resulting shape of the output feature map
+    will be (B, C, factor * H, factor * W). Note that the learning rate and the
+    weight decay are set to 0 in order to keep coefficient values of bilinear
+    interpolation unchanged during training. 
+    """
+
+    def __init__(self):
+        """Constructor for BilinearInitializer.
+        """
+        super(BilinearInitializer, self).__init__()
+
+    def __call__(self, var, block):
+        """Add biliear initialization ops for a variable
+
+        Args:
+            var (Variable): Variable that needs to be initialized.
+            block (Block): The block in which initialization ops should
+                           be added.
+
+        Returns:
+            the initialization op
+
+        Raises:
+            ValueError: If type of `var` and `block` is not right.
+                        If the shape of `var` size is not 4 and
+                        var.shape[2] != var.shape[3].
+        """
+        if not isinstance(var, framework.Variable):
+            raise ValueError("var must be framework.Variable.")
+
+        if not isinstance(block, framework.Block):
+            raise ValueError("block must be framework.Block.")
+
+        shape = var.shape
+        if len(shape) != 4:
+            raise ValueError("the length of shape must be 4.")
+        if shape[2] != shape[3]:
+            raise ValueError("shape[2] must be equal to shape[3].")
+
+        weight = np.zeros(np.prod(var.shape), dtype='float32')
+        size = shape[3]
+        # factor
+        f = np.ceil(size / 2.)
+        # center
+        c = (2 * f - 1 - f % 2) / (2. * f)
+        for i in range(np.prod(shape)):
+            x = i % size
+            y = (i / size) % size
+            weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
+        weight = np.reshape(weight, shape)
+
+        if var.dtype == VarDesc.VarType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in weight.flat]
+        else:
+            raise ValueError("Unsupported dtype %s", input.dtype)
+        if np.prod(shape) > 1024 * 1024:
+            raise ValueError("The size of input is too big. ")
+        op = block.append_op(
+            type='assign_value',
+            outputs={'Out': [var]},
+            attrs={
+                'dtype': var.dtype,
+                'shape': list(shape),
+                value_name: values
+            })
+        var.op = op
+        return op
+
+
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
@@ -436,3 +533,4 @@ Uniform = UniformInitializer
 Normal = NormalInitializer
 Xavier = XavierInitializer
 MSRA = MSRAInitializer
+Bilinear = BilinearInitializer
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 80e8ff484a4c04df1b41bbca284d7c604962934c..581770feea98230ce6161bd11dc43f79cecd0048 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -20,13 +20,13 @@ from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
+import numpy
 
 __all__ = [
     'split_lod_tensor',
     'merge_lod_tensor',
     'BlockGuard',
     'BlockGuardWithCompletion',
-    'StaticRNNMemoryLink',
     'WhileGuard',
     'While',
     'Switch',
@@ -55,34 +55,36 @@ __all__ = [
 
 def split_lod_tensor(input, mask, level=0):
     """
-    **split_lod_tensor**
-
     This function takes in an input that contains the complete lod information,
     and takes in a mask which is used to mask certain parts of the input.
     The output is the true branch and the false branch with the mask applied to
-    the input at a certain level in the tensor.
+    the input at a certain level in the tensor. Mainly used in IfElse to split
+    data into two parts.
 
     Args:
         input(tuple|list|None): The input tensor that contains complete
                                 lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to rank.
+        level(int): The specific lod level to split.
 
     Returns:
-        Variable: The true branch of tensor as per the mask applied to input.
-        Variable: The false branch of tensor as per the mask applied to input.
+        tuple(Variable, Variable):
+        The true branch of tensor as per the mask applied to input.
+
+        The false branch of tensor as per the mask applied to input.
 
     Examples:
         .. code-block:: python
 
-          x = layers.data(name='x', shape=[1])
+          x = fluid.layers.data(name='x', shape=[1])
           x.persistable = True
 
-          y = layers.data(name='y', shape=[1])
+          y = fluid.layers.data(name='y', shape=[1])
           y.persistable = True
 
-          out_true, out_false = layers.split_lod_tensor(
+          out_true, out_false = fluid.layers.split_lod_tensor(
                 input=x, mask=y, level=level)
+
     """
     helper = LayerHelper('split_lod_tensor', **locals())
     out_true = helper.create_tmp_variable(dtype=input.dtype)
@@ -105,8 +107,9 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
 
     This function takes in an input :math:`x`, the True branch, the False
     branch and a binary :math:`mask`. Using this information, this function
-    merges the True and False branches of the tensor into a single Output
-    at a certain lod level indiacted by :math:`level`.
+    merges the True and False branches of the tensor into a single tensor as
+    output at a certain lod level indicated by :math:`level`. Used in IfElse
+    to merge the output if True block and False Block.
 
     Args:
         in_true(tuple|list|None): The True branch to be merged.
@@ -114,7 +117,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
         x(tuple|list|None): The input tensor that contains complete
                             lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to rank.
+        level(int): The specific lod level to merge.
 
     Returns:
         Variable: The merged output tensor.
@@ -233,9 +236,56 @@ class BlockGuard(object):
 
 class ParallelDo(object):
     """
-    ParallelDo class.
+    ParallelDo is used to represent multi-thread data parallel processing.
+
+    Its vanilla implementation can be shown as the following (:math:`|` means
+    single thread and :math:`||||` means multiple threads)
+
+    .. code-block:: text
+
+      In the forward pass
+        |      Split input onto different devices
+        |      Copy parameter onto different devices
+        ||||   Compute forward pass in parallel
+        |      Merge output from different devices
+
+      In the backward pass
+        |      Split output@grad onto different devices
+        ||||   Compute backward pass in parallel
+        |      accumulate param@grad from different devices to the first device
+        |      Merge input@grad from different devices
+        |      Copy param@grad to the place of parallel_do_op
 
-    ParallelDo class is used to create a ParallelDo.
+    Examples:
+
+    .. code-block:: python
+
+      images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+      label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+      # ParallelDo version & Single-thread version
+      if thread_num > 1:
+          places = fluid.layers.get_places(thread_num)
+          pd = fluid.layers.ParallelDo(places)
+          with pd.do():
+              images = pd.read_input(images)
+              label = pd.read_input(label)
+              predict = cnn_model(images)
+              cost = fluid.layers.cross_entropy(input=predict, label=label)
+
+              avg_cost = fluid.layers.mean(x=cost)
+              pd.write_output(avg_cost)
+
+          avg_cost = pd()
+          avg_cost = fluid.layers.mean(avg_cost)
+      else:
+          predict = cnn_model(images)
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
+          avg_cost = fluid.layers.mean(x=cost)
+
+    .. warning::
+    
+       It will be soon deprecated, please use ParallelExecutor instead.
     """
 
     def __init__(self, places, use_nccl=False, name=None):
@@ -362,16 +412,17 @@ class StaticRNNMemoryLink(object):
     """
     StaticRNNMemoryLink class.
 
-    Args:
-        init: the initial variable for Memory
-        init: Variable
-        pre_mem: the memory variable in previous time step
-        pre_mem: Variable
-        mem: the memory variable in current time step
-        mem: Variable
-
     StaticRNNMemoryLink class is used to create a link between two
     memory cells of a StaticRNN.
+
+
+    NOTE: This is a internal data structure of a very low-level API.
+    Please use StaticRNN instead.
+
+    Args:
+        init(Variable): the initial variable for Memory.
+        pre_mem(Variable): the memory variable in previous time step.
+        mem(Variable): the memory variable in current time step.
     """
 
     def __init__(self, init, pre_mem, mem=None):
@@ -606,6 +657,29 @@ class WhileGuard(BlockGuard):
 
 
 class While(object):
+    """
+    while loop control flow.
+
+    Args:
+        cond (Variable): condition used to compare.
+        name (str): The name of this layer.
+
+    Examples:
+          .. code-block:: python
+
+            d0 = layers.data("d0", shape=[10], dtype='float32')
+            data_array = layers.array_write(x=d0, i=i)
+            array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+            cond = layers.less_than(x=i, y=array_len)
+            while_op = layers.While(cond=cond)
+            with while_op.block():
+                d = layers.array_read(array=data_array, i=i)
+                i = layers.increment(x=i, in_place=True)
+                layers.array_write(result, i=i, array=d)
+                layers.less_than(x=i, y=array_len, cond=cond)
+    """
+
     BEFORE_WHILE_BLOCK = 0
     IN_WHILE_BLOCK = 1
     AFTER_WHILE_BLOCK = 2
@@ -675,8 +749,8 @@ def lod_rank_table(x, level=0):
         .. code-block:: text
 
             x is a LoDTensor:
-                x.lod = [[0,                2, 3],
-                         [0,             5, 6, 7]]
+                x.lod = [[2,                1],
+                         [5,             1, 1]]
                 x.data = [a, b, c, d, e, f, g]
 
             1. set level to 0:
@@ -706,7 +780,7 @@ def lod_rank_table(x, level=0):
         .. code-block:: python
 
             x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
+                                  dtype='float32', lod_level=1)
             out = layers.lod_rank_table(x=x, level=0)
     """
     helper = LayerHelper("lod_rank_table", **locals())
@@ -748,17 +822,25 @@ def max_sequence_len(rank_table):
 
 
 def lod_tensor_to_array(x, table):
-    """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
+    """ 
+    Convert a LoDTensor to a LoDTensorArray.
+
+    This function split a LoDTesnor to a LoDTensorArray according to its LoD 
+    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
+    PaddlePaddle. The generated LoDTensorArray of this function can be further read 
+    or written by `read_from_array()` and `write_to_array()` operators. However, 
+    this function is generally an internal component of PaddlePaddle `DynamicRNN`. 
+    Users should not use it directly.
 
     Args:
-        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
+        x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
-                                descending order.
+                                descending order. It is generally generated 
+                                by `layers.lod_rank_table()` API.
 
     Returns:
-        Variable: The variable of type array that has been converted from a
-                  tensor.
+        Variable: The LoDTensorArray that has been converted from the input tensor.
 
     Examples:
         .. code-block:: python
@@ -823,8 +905,7 @@ def increment(x, value=1.0, in_place=True):
         in_place (bool): If the increment should be performed in-place.
 
     Returns:
-        Variable: The tensor variable storing the transformation of
-                  element-wise increment of each value in the input.
+        Variable: The elementwise-incremented object.
 
     Examples:
         .. code-block:: python
@@ -866,7 +947,7 @@ def array_write(x, i, array=None):
         Variable: The output LOD_TENSOR_ARRAY where the input tensor is written.
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -887,14 +968,17 @@ def array_write(x, i, array=None):
 
 
 def create_array(dtype):
-    """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
-    LayerHelper.
+    """
+    **Create LoDTensorArray**
+
+    This function creates an array of LOD_TENSOR_ARRAY . It is mainly used to
+    implement RNN with array_write, array_read and While.
 
     Args:
-        dtype (int|float): The data type of the elements in the array.
+        dtype (int|float): The data type of the elements in the lod_tensor_array.
 
     Returns:
-        Variable: The tensor variable storing the elements of data type.
+        Variable: The lod_tensor_array variable storing the elements of data type.
 
     Examples:
         .. code-block:: python
@@ -909,37 +993,40 @@ def create_array(dtype):
         dtype=dtype)
 
 
-def less_than(x, y, force_cpu=True, cond=None, **ignored):
+@templatedoc()
+def less_than(x, y, force_cpu=None, cond=None, **ignored):
     """
-    **Less than**
+    ${comment}
 
-    This layer returns the truth value of :math:`x < y` elementwise.
+    >>> import paddle.fluid as fluid
+    >>> less = fluid.layers.less_than(x=label, y=limit)
 
     Args:
-        x(Variable): First operand of *less_than*
-        y(Variable): Second operand of *less_than*
-        force_cpu(Bool|True): The output data will be on CPU if set true.
+        x(${x_type}): ${x_comment}.
+        y(${y_type}): ${y_comment}.
+        force_cpu(${force_cpu_type}): ${force_cpu_comment}.
         cond(Variable|None): Optional output variable to store the result of *less_than*
 
     Returns:
-        Variable: The tensor variable storing the output of *less_than*.
-
-    Examples:
-        .. code-block:: python
-
-          less = fluid.layers.less_than(x=label, y=limit)
+        ${out_comment}.
     """
     helper = LayerHelper("less_than", **locals())
     if cond is None:
         cond = helper.create_tmp_variable(dtype='bool')
         cond.stop_gradient = True
 
+    attrs = dict()
+    if force_cpu is not None:
+        attrs['force_cpu'] = force_cpu
+    elif force_init_on_cpu():
+        attrs['force_cpu'] = force_init_on_cpu()
+
     helper.append_op(
         type='less_than',
         inputs={'X': [x],
                 'Y': [y]},
         outputs={'Out': [cond]},
-        attrs={'force_cpu': force_cpu or force_init_on_cpu()})
+        attrs=attrs)
     return cond
 
 
@@ -974,16 +1061,34 @@ def equal(x, y, cond=None, **ignored):
 
 
 def array_read(array, i):
-    """This function performs the operation to read the data in as an
+    """
+    This function performs the operation to read the data in as an
     LOD_TENSOR_ARRAY.
+
+    .. code-block:: text
+
+        Given:
+
+        array = [0.6, 0.1, 0.3, 0.1]
+        
+        And:
+        
+        i = 2
+
+        Then:
+
+        output = 0.3
+
     Args:
-        array (Variable|list): The input tensor that will be written to an array.
-        i (Variable|list): The subscript index in tensor array, that points the
-                           place where data will be written to.
+        array (Variable|list): The input tensor that store data to be read.
+        i (Variable|list): The index of the data to be read from input array.
+
     Returns:
         Variable: The tensor type variable that has the data written to it.
+
     Examples:
-        .. code-block::python
+        .. code-block:: python
+
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
           arr = layers.array_read(tmp, i=i)
@@ -1004,8 +1109,28 @@ def array_read(array, i):
 
 def shrink_memory(x, i, table):
     """
-    This function creates an operator to shrink_rnn_memory using the RankTable
+    This function creates an operator to shrink rnn memory using the RankTable
     as mentioned in the input parameter.
+
+    NOTE: This API is very low-level API. It is used by DynamicRNN only.
+
+    Since the Dynamic RNN uses no-padding way to implement RNN. The sequence
+    will be sorted by order, and the length of valid memory will be shrink after
+    each time step.
+
+    Args:
+        x(Variable): The memory object in the previous time step.
+        i(Variable): The step count variable. A int scalar as LoDTensor.
+        table(Variable): The RNNRankTable object.
+
+    Returns:
+        the memory variable after shrink.
+
+    Examples:
+
+        Since this API is very low level API. The example is not provided.
+        Please reference the implementation of class DynamicRNN for detail
+        usage.
     """
     helper = LayerHelper('shrink_memory', **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
@@ -1020,9 +1145,14 @@ def shrink_memory(x, i, table):
 
 
 def array_length(array):
-    """This function performs the operation to find the length of the input
+    """
+    **Get the Length of Input LoDTensorArray**
+
+    This function performs the operation to find the length of the input
     LOD_TENSOR_ARRAY.
 
+    Related API: array_read, array_write, While.
+
     Args:
         array (LOD_TENSOR_ARRAY): The input array that will be used
                                   to compute the length.
@@ -1031,12 +1161,13 @@ def array_length(array):
         Variable: The length of the input LoDTensorArray.
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
           arr = fluid.layers.array_write(tmp, i=i)
           arr_len = fluid.layers.array_length(arr)
+
     """
     helper = LayerHelper('array_length', **locals())
     tmp = helper.create_tmp_variable(dtype='int64')
@@ -1047,6 +1178,13 @@ def array_length(array):
 
 
 class ConditionalBlockGuard(BlockGuard):
+    """
+    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for 
+    holding a ConditionalBlock, and helping users entering and exiting the 
+    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard 
+    is generally an internal component of IfElse, users should not use it directly.
+    """
+
     def __init__(self, block):
         if not isinstance(block, ConditionalBlock):
             raise TypeError("block should be conditional block")
@@ -1120,6 +1258,42 @@ class ConditionalBlock(object):
 
 
 class Switch(object):
+    """
+    Switch class works just like a `if-elif-else`. Can be used in learning rate scheduler
+    to modify learning rate
+
+    The Semantics:
+
+    1. A `switch` control-flow checks cases one-by-one.
+
+    2. The condition of each case is a boolean value, which is a scalar Variable.
+
+    3. It runs the first matched case, or the default case if there is one.
+
+    4. Once it matches a case, it runs the corresponding branch and only that branch.
+
+    Examples:
+        .. code-block:: python
+
+            lr = fluid.layers.tensor.create_global_var(
+                shape=[1],
+                value=0.0,
+                dtype='float32',
+                persistable=True,
+                name="learning_rate")
+            one_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+            two_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=2.0)
+
+            with fluid.layers.control_flow.Switch() as switch:
+                with switch.case(global_step == zero_var):
+                    fluid.layers.tensor.assign(input=one_var, output=lr)
+                with switch.default():
+                    fluid.layers.tensor.assign(input=two_var, output=lr)
+
+    """
+
     def __init__(self, name=None):
         self.helper = LayerHelper('switch', name=name)
         self.inside_scope = False
@@ -1149,7 +1323,8 @@ class Switch(object):
         return ConditionalBlockGuard(cond_block)
 
     def default(self):
-        """create a default case for this switch
+        """
+        create a default case for this switch
         """
         pre_cond_num = len(self.pre_not_conditions)
         if pre_cond_num == 0:
@@ -1209,6 +1384,34 @@ class IfElseBlockGuard(object):
 
 
 class IfElse(object):
+    """
+    if-else control flow.
+
+    Args:
+        cond (Variable): condition used to compare.
+        name (str, default None): The name of this layer.
+
+    Examples:
+          .. code-block:: python
+
+            limit = fluid.layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = fluid.layers.less_than(x=label, y=limit)
+            ie = fluid.layers.IfElse(cond)
+            with ie.true_block():
+                true_image = ie.input(image)
+                hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
+                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            with ie.false_block():
+                false_image = ie.input(image)
+                hidden = fluid.layers.fc(
+                    input=false_image, size=200, act='tanh')
+                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+            prob = ie()
+    """
     OUT_IF_ELSE_BLOCKS = 0
     IN_IF_ELSE_TRUE_BLOCKS = 1
     IN_IF_ELSE_FALSE_BLOCKS = 2
@@ -1311,6 +1514,38 @@ class IfElse(object):
 
 
 class DynamicRNN(object):
+    """
+    The dynamic RNN can process a batch of sequence data. The length of each
+    sample sequence can be different. This API automatically process them in
+    batch.
+
+    The input lod must be set. Please reference `lod_tensor`
+
+    >>> import paddle.fluid as fluid
+    >>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1)
+    >>> embedding = fluid.layers.embedding(input=data, size=[65535, 32],
+    >>>                                    is_sparse=True)
+    >>>
+    >>> drnn = fluid.layers.DynamicRNN()
+    >>> with drnn.block():
+    >>>     word = drnn.step_input(embedding)
+    >>>     prev = drnn.memory(shape=[200])
+    >>>     hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
+    >>>     drnn.update_memory(prev, hidden)  # set prev to hidden
+    >>>     drnn.output(hidden)
+    >>>
+    >>> # last is the last time step of rnn. It is the encoding result.
+    >>> last = fluid.layers.sequence_last_step(drnn())
+
+    The dynamic RNN will unfold sequence into timesteps. Users need to define
+    how to process each time step during the :code:`with` block.
+
+    The `memory` is used staging data cross time step. The initial value of
+    memory can be zero or another variable.
+
+    The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
+    get the output sequence.
+    """
     BEFORE_RNN = 0
     IN_RNN = 1
     AFTER_RNN = 2
@@ -1333,6 +1568,15 @@ class DynamicRNN(object):
         self.mem_link = []
 
     def step_input(self, x):
+        """
+        Mark a sequence as a dynamic RNN input.
+        Args:
+            x(Variable): The input sequence.
+
+        Returns:
+            The current timestep in the input sequence.
+
+        """
         self._assert_in_rnn_block_("step_input")
         if not isinstance(x, Variable):
             raise TypeError(
@@ -1376,6 +1620,15 @@ class DynamicRNN(object):
         return array_read(array=input_array, i=self.step_idx)
 
     def static_input(self, x):
+        """
+        Mark a variable as a RNN input. The input will not be scattered into
+        time steps.
+        Args:
+            x(Variable): The input variable.
+
+        Returns:
+            The input variable that can access in RNN.
+        """
         self._assert_in_rnn_block_("static_input")
         if not isinstance(x, Variable):
             raise TypeError(
@@ -1397,6 +1650,10 @@ class DynamicRNN(object):
 
     @contextlib.contextmanager
     def block(self):
+        """
+        The block for user to define operators in RNN. See the class docstring
+        for more details.
+        """
         if self.status != DynamicRNN.BEFORE_RNN:
             raise ValueError("rnn.block() can only be invoke once")
         self.step_idx = fill_constant(
@@ -1423,6 +1680,9 @@ class DynamicRNN(object):
                     x=each_array, table=self.lod_rank_table))
 
     def __call__(self, *args, **kwargs):
+        """
+        Get the output of RNN. This API should only be invoked after RNN.block()
+        """
         if self.status != DynamicRNN.AFTER_RNN:
             raise ValueError(("Output of the dynamic RNN can only be visited "
                               "outside the rnn block."))
@@ -1437,6 +1697,70 @@ class DynamicRNN(object):
                value=0.0,
                need_reorder=False,
                dtype='float32'):
+        """
+        Create a memory variable for dynamic rnn.
+
+        If the :code:`init` is not None, :code:`memory` will be initialized by
+        this variable. The :code:`need_reorder` is used to reorder the memory as
+        the input variable. It should be set to true when the initialized memory
+        depends on the input sample.
+
+        For example,
+
+        >>> import paddle.fluid as fluid
+        >>> sentence = fluid.layers.data(
+        >>>                 name='sentence', dtype='float32', shape=[32])
+        >>> boot_memory = fluid.layers.data(
+        >>>                 name='boot', dtype='float32', shape=[10])
+        >>>
+        >>> drnn = fluid.layers.DynamicRNN()
+        >>> with drnn.block():
+        >>>     word = drnn.step_input(sentence)
+        >>>     memory = drnn.memory(init=boot_memory, need_reorder=True)
+        >>>     hidden = fluid.layers.fc(
+        >>>                 input=[word, memory], size=10, act='tanh')
+        >>>     drnn.update_memory(ex_mem=memory, new_mem=hidden)
+        >>>     drnn.output(hidden)
+        >>> rnn_output = drnn()
+
+
+        Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the
+        :code:`memory` will be initialized by this :code:`value`.
+
+        For example,
+
+        >>> import paddle.fluid as fluid
+        >>> sentence = fluid.layers.data(
+        >>>                 name='sentence', dtype='float32', shape=[32])
+        >>>
+        >>> drnn = fluid.layers.DynamicRNN()
+        >>> with drnn.block():
+        >>>     word = drnn.step_input(sentence)
+        >>>     memory = drnn.memory(shape=[10], dtype='float32', value=0)
+        >>>     hidden = fluid.layers.fc(
+        >>>             input=[word, memory], size=10, act='tanh')
+        >>>     drnn.update_memory(ex_mem=memory, new_mem=hidden)
+        >>>     drnn.output(hidden)
+        >>> rnn_output = drnn()
+
+
+        Args:
+            init(Variable|None): The initialized variable.
+
+            shape(list|tuple): The memory shape. NOTE the shape does not contain
+            batch_size.
+
+            value(float): the initalized value.
+
+            need_reorder(bool): True if the initialized memory depends on the
+            input sample.
+
+            dtype(str|numpy.dtype): The data type of the initialized memory.
+
+        Returns:
+            the memory variable.
+
+        """
         self._assert_in_rnn_block_('memory')
         if init is not None:
             if not isinstance(init, Variable):
@@ -1504,6 +1828,16 @@ class DynamicRNN(object):
             return self.memory(init=init)
 
     def update_memory(self, ex_mem, new_mem):
+        """
+        Update the memory from ex_mem to new_mem. NOTE that the shape and data
+        type of :code:`ex_mem` and :code:`new_mem` must be same.
+        Args:
+            ex_mem(Variable): the memory variable.
+            new_mem(Variable): the plain variable generated in RNN block.
+
+        Returns:
+            None
+        """
         self._assert_in_rnn_block_('update_memory')
         if not isinstance(ex_mem, Variable):
             raise TypeError("The input arg `ex_mem` of update_memory() must "
@@ -1521,6 +1855,15 @@ class DynamicRNN(object):
         self.mem_link.append((new_mem, mem_array))
 
     def output(self, *outputs):
+        """
+        mark the RNN output variables.
+
+        Args:
+            outputs: The output variables.
+
+        Returns:
+            None
+        """
         self._assert_in_rnn_block_('output')
         parent_block = self._parent_block_()
         for each in outputs:
@@ -1563,26 +1906,26 @@ def reorder_lod_tensor_by_rank(x, rank_table):
 
 def is_empty(x, cond=None, **ignored):
     """
-    **Is Empty**
-
-    This layer returns the truth value of whether the variable is empty.
+    Test whether a Variable is empty.
 
     Args:
-        x(Variable): Operand of *is_empty*
-        cond(Variable|None): Optional output variable to store the result
-                             of *is_empty*
+        x (Variable): The Variable to be tested.
+        cond (Variable|None): Output parameter. Returns the test result 
+                              of given 'x'. Default: None
 
     Returns:
-        Variable: The tensor variable storing the output of *is_empty*.
+        Variable: A bool scalar. True if 'x' is an empty Variable.
 
     Raises:
         TypeError: If input cond is not a variable, or cond's dtype is
-                   not bool
+                   not bool.
 
     Examples:
         .. code-block:: python
 
-          less = fluid.layers.is_empty(x=input)
+          res = fluid.layers.is_empty(x=input)
+          # or:
+          fluid.layers.is_empty(x=input, cond=res)
     """
     helper = LayerHelper("is_empty", **locals())
     if cond is None:
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 3a83db12fd13651578deeac6b562bac2f1e4e4b6..d5471d182bf19015995aeec2a81ec5a772765712 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -97,7 +97,9 @@ def detection_output(loc,
         nms_eta(float): The parameter for adaptive NMS.
 
     Returns:
-        Variable: The detection outputs is a LoDTensor with shape [No, 6].
+        Variable: 
+        
+            The detection outputs is a LoDTensor with shape [No, 6].
             Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
             `No` is the total number of detections in this mini-batch. For each
             instance, the offsets in first dimension are called LoD, the offset
@@ -110,15 +112,15 @@ def detection_output(loc,
     Examples:
         .. code-block:: python
 
-        pb = layers.data(name='prior_box', shape=[10, 4],
+            pb = layers.data(name='prior_box', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
-        pbv = layers.data(name='prior_box_var', shape=[10, 4],
+            pbv = layers.data(name='prior_box_var', shape=[10, 4],
                           append_batch_size=False, dtype='float32')
-        loc = layers.data(name='target_box', shape=[2, 21, 4],
+            loc = layers.data(name='target_box', shape=[2, 21, 4],
                           append_batch_size=False, dtype='float32')
-        scores = layers.data(name='scores', shape=[2, 21, 10],
+            scores = layers.data(name='scores', shape=[2, 21, 10],
                           append_batch_size=False, dtype='float32')
-        nmsed_outs = fluid.layers.detection_output(scores=scores,
+            nmsed_outs = fluid.layers.detection_output(scores=scores,
                                        loc=loc,
                                        prior_box=pb,
                                        prior_box_var=pbv)
@@ -210,53 +212,68 @@ def bipartite_match(dist_matrix,
                     dist_threshold=None,
                     name=None):
     """
-    **Bipartite matchint operator**
-
-    This operator is a greedy bipartite matching algorithm, which is used to
-    obtain the matching with the maximum distance based on the input
+    This operator implements a greedy bipartite matching algorithm, which is
+    used to obtain the matching with the maximum distance based on the input
     distance matrix. For input 2D matrix, the bipartite matching algorithm can
-    find the matched column for each row, also can find the matched row for
-    each column. And this operator only calculate matched indices from column
-    to row. For each instance, the number of matched indices is the number of
-    of columns of the input ditance matrix.
-
-    There are two outputs to save matched indices and distance.
-    A simple description, this algothrim matched the best (maximum distance)
+    find the matched column for each row (matched means the largest distance),
+    also can find the matched row for each column. And this operator only
+    calculate matched indices from column to row. For each instance,
+    the number of matched indices is the column number of the input distance
+    matrix.
+
+    There are two outputs, matched indices and distance.
+    A simple description, this algorithm matched the best (maximum distance)
     row entity to the column entity and the matched indices are not duplicated
     in each row of ColToRowMatchIndices. If the column entity is not matched
     any row entity, set -1 in ColToRowMatchIndices.
 
-    Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+    NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
     If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
     If Tensor, the height of ColToRowMatchIndices is 1.
 
+    NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
+    layer. Please consider to use :code:`ssd_loss` instead.
+
     Args:
         dist_matrix(Variable): This input is a 2-D LoDTensor with shape
             [K, M]. It is pair-wise distance matrix between the entities
             represented by each row and each column. For example, assumed one
             entity is A with shape [K], another entity is B with shape [M]. The
-            dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger
-            the distance is, the better macthing the pairs are. Please note,
-            This tensor can contain LoD information to represent a batch of
-            inputs. One instance of this batch can contain different numbers of
-            entities.
+            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
+            the distance is, the better matching the pairs are.
+
+            NOTE: This tensor can contain LoD information to represent a batch
+            of inputs. One instance of this batch can contain different numbers
+            of entities.
         match_type(string|None): The type of matching method, should be
-           'bipartite' or 'per_prediction', 'bipartite' by defalut.
+           'bipartite' or 'per_prediction'. [default 'bipartite'].
         dist_threshold(float|None): If `match_type` is 'per_prediction',
             this threshold is to determine the extra matching bboxes based
-            on the maximum distance, 0.5 by defalut.
+            on the maximum distance, 0.5 by default.
     Returns:
-        match_indices(Variable): A 2-D Tensor with shape [N, M] in int type.
-            N is the batch size. If match_indices[i][j] is -1, it
-            means B[j] does not match any entity in i-th instance.
-            Otherwise, it means B[j] is matched to row
-            match_indices[i][j] in i-th instance. The row number of
-            i-th instance is saved in match_indices[i][j].
-        match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
-            N is batch size. If match_indices[i][j] is -1,
-            match_distance[i][j] is also -1.0. Otherwise, assumed
-            match_distance[i][j] = d, and the row offsets of each instance
-            are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j].
+        tuple: a tuple with two elements is returned. The first is
+        matched_indices, the second is matched_distance.
+
+        The matched_indices is a 2-D Tensor with shape [N, M] in int type.
+        N is the batch size. If match_indices[i][j] is -1, it
+        means B[j] does not match any entity in i-th instance.
+        Otherwise, it means B[j] is matched to row
+        match_indices[i][j] in i-th instance. The row number of
+        i-th instance is saved in match_indices[i][j].
+
+        The matched_distance is a 2-D Tensor with shape [N, M] in float type
+        . N is batch size. If match_indices[i][j] is -1,
+        match_distance[i][j] is also -1.0. Otherwise, assumed
+        match_distance[i][j] = d, and the row offsets of each instance
+        are called LoD. Then match_distance[i][j] =
+        dist_matrix[d+LoD[i]][j].
+
+    Examples:
+
+        >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32')
+        >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32')
+        >>> iou = fluid.layers.iou_similarity(x=x, y=y)
+        >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
     """
     helper = LayerHelper('bipartite_match', **locals())
     match_indices = helper.create_tmp_variable(dtype='int32')
@@ -281,8 +298,6 @@ def target_assign(input,
                   mismatch_value=None,
                   name=None):
     """
-    **Target assigner operator**
-
     This operator can be, for given the target bounding boxes or labels,
     to assign classification and regression targets to each prediction as well as
     weights to prediction. The weights is used to specify which prediction would
@@ -296,20 +311,24 @@ def target_assign(input,
 
     1. Assigning all outpts based on `match_indices`:
 
-    If id = match_indices[i][j] > 0,
+    .. code-block:: text
+
+        If id = match_indices[i][j] > 0,
 
-        out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-        out_weight[i][j] = 1.
+            out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
+            out_weight[i][j] = 1.
 
-    Otherwise,
+        Otherwise,
 
-        out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
-        out_weight[i][j] = 0.
+            out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
+            out_weight[i][j] = 0.
 
     2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided:
 
     Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
     for i-th instance and each `id` of neg_indices in this instance:
+    
+    .. code-block:: text
 
         out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
         out_weight[i][id] = 1.0
@@ -326,10 +345,23 @@ def target_assign(input,
        mismatch_value (float32): Fill this value to the mismatched location.
 
     Returns:
-       out (Variable): The output is a 3D Tensor with shape [N, P, K],
-           N and P is the same as they are in `neg_indices`, K is the
-           same as it in input of X. If `match_indices[i][j]`.
-       out_weight (Variable): The weight for output with the shape of [N, P, 1].
+        tuple: 
+        
+               A tuple(out, out_weight) is returned. out is a 3D Tensor with 
+               shape [N, P, K], N and P is the same as they are in 
+               `neg_indices`, K is the same as it in input of X. If 
+               `match_indices[i][j]`. out_weight is the weight for output with 
+               the shape of [N, P, 1].
+
+    Examples:
+
+        .. code-block:: python
+
+            matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
+            gt = layers.data(
+                        name='gt', shape=[1, 1], dtype='int32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                            gt, matched_indices, mismatch_value=0)
     """
     helper = LayerHelper('target_assign', **locals())
     out = helper.create_tmp_variable(dtype=input.dtype)
@@ -364,7 +396,7 @@ def ssd_loss(location,
              normalize=True,
              sample_size=None):
     """
-    **Multi-box loss layer for object dection algorithm of SSD**
+    **Multi-box loss layer for object detection algorithm of SSD**
 
     This layer is to compute dection loss for SSD given the location offset
     predictions, confidence predictions, prior boxes and ground-truth boudding
@@ -372,21 +404,35 @@ def ssd_loss(location,
     is a weighted sum of the localization loss (or regression loss) and
     confidence loss (or classification loss) by performing the following steps:
 
-    1. Find matched boundding box by bipartite matching algorithm.
+    1. Find matched bounding box by bipartite matching algorithm.
+
       1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+
       1.2 Compute matched boundding box by bipartite matching algorithm.
+
     2. Compute confidence for mining hard examples
+
       2.1. Get the target label based on matched indices.
+
       2.2. Compute confidence loss.
+
     3. Apply hard example mining to get the negative example indices and update
        the matched indices.
+
     4. Assign classification and regression targets
+
       4.1. Encoded bbox according to the prior boxes.
+
       4.2. Assign regression targets.
+
       4.3. Assign classification targets.
+
     5. Compute the overall objective loss.
+
       5.1 Compute confidence loss.
+
       5.1 Compute localization loss.
+
       5.3 Compute the overall weighted loss.
 
     Args:
@@ -421,39 +467,36 @@ def ssd_loss(location,
         mining_type (str): The hard example mining type, should be 'hard_example'
             or 'max_negative', now only support `max_negative`.
         normalize (bool): Whether to normalize the SSD loss by the total number
-            of output locations, True by defalut.
+            of output locations, True by default.
         sample_size (int): The max sample size of negative box, used only when
             mining_type is 'hard_example'.
 
     Returns:
-        Variable: The weighted sum of the localization loss and confidence loss,
-            with shape [N * Np, 1], N and Np are the same as they are
-            in `location`.
+        The weighted sum of the localization loss and confidence loss, with \
+        shape [N * Np, 1], N and Np are the same as they are in `location`.
 
     Raises:
-        ValueError: If mining_type is 'hard_example', now only support
-            mining type of `max_negative`.
+        ValueError: If mining_type is 'hard_example', now only support mining \
+        type of `max_negative`.
 
     Examples:
-        .. code-block:: python
-
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
-            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-            gt_label = layers.data(
-                name='gt_label', shape=[1], lod_level=1, dtype='float32')
-            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+        >>> pb = fluid.layers.data(
+        >>>                   name='prior_box',
+        >>>                   shape=[10, 4],
+        >>>                   append_batch_size=False,
+        >>>                   dtype='float32')
+        >>> pbv = fluid.layers.data(
+        >>>                   name='prior_box_var',
+        >>>                   shape=[10, 4],
+        >>>                   append_batch_size=False,
+        >>>                   dtype='float32')
+        >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32')
+        >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32')
+        >>> gt_box = fluid.layers.data(
+        >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
+        >>> gt_label = fluid.layers.data(
+        >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
+        >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
     """
 
     helper = LayerHelper('ssd_loss', **locals())
@@ -577,7 +620,7 @@ def prior_box(input,
               offset=0.5,
               name=None):
     """
-    **Prior box operator**
+    **Prior Box Operator**
 
     Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
     Each position of the input produce N prior boxes, N is determined by
@@ -606,26 +649,30 @@ def prior_box(input,
        name(str): Name of the prior box op. Default: None.
 
     Returns:
-        boxes(Variable): the output prior boxes of PriorBox.
-             The layout is [H, W, num_priors, 4].
-             H is the height of input, W is the width of input,
-             num_priors is the total
-             box count of each position of input.
-        Variances(Variable): the expanded variances of PriorBox.
-             The layout is [H, W, num_priors, 4].
-             H is the height of input, W is the width of input
-             num_priors is the total
-             box count of each position of input
+        tuple: A tuple with two Variable (boxes, variances)
+
+        boxes: the output prior boxes of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input,
+        num_priors is the total
+        box count of each position of input.
+
+        variances: the expanded variances of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input
+        num_priors is the total
+        box count of each position of input
 
 
     Examples:
         .. code-block:: python
-            box, var = prior_box(
-            input=conv1,
-            image=images,
-            min_sizes=[100.],
-            flip=True,
-            clip=True)
+
+            box, var = fluid.layers.prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.],
+                flip=True,
+                clip=True)
     """
     helper = LayerHelper("prior_box", **locals())
     dtype = helper.input_dtype()
@@ -695,11 +742,9 @@ def multi_box_head(inputs,
                    stride=1,
                    name=None):
     """
-    **Prior_boxes**
-
     Generate prior boxes for SSD(Single Shot MultiBox Detector)
     algorithm. The details of this algorithm, please refer the
-    section 2.2 of SSD paper (SSD: Single Shot MultiBox Detector)
+    section 2.2 of SSD paper `SSD: Single Shot MultiBox Detector
     <https://arxiv.org/abs/1512.02325>`_ .
 
     Args:
@@ -740,24 +785,27 @@ def multi_box_head(inputs,
        name(str): Name of the prior box layer. Default: None.
 
     Returns:
-        mbox_loc(Variable): The predicted boxes' location of the inputs.
-             The layout is [N, H*W*Priors, 4]. where Priors
-             is the number of predicted boxes each position of each input.
-        mbox_conf(Variable): The predicted boxes' confidence of the inputs.
-             The layout is [N, H*W*Priors, C]. where Priors
-             is the number of predicted boxes each position of each input
-             and C is the number of Classes.
-        boxes(Variable): the output prior boxes of PriorBox.
-             The layout is [num_priors, 4]. num_priors is the total
-             box count of each position of inputs.
-        Variances(Variable): the expanded variances of PriorBox.
-             The layout is [num_priors, 4]. num_priors is the total
-             box count of each position of inputs
+        tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
+
+        mbox_loc: The predicted boxes' location of the inputs. The layout
+        is [N, H*W*Priors, 4]. where Priors is the number of predicted
+        boxes each position of each input.
+
+        mbox_conf: The predicted boxes' confidence of the inputs. The layout
+        is [N, H*W*Priors, C]. where Priors is the number of predicted boxes
+        each position of each input and C is the number of Classes.
+
+        boxes: the output prior boxes of PriorBox. The layout is [num_priors, 4].
+        num_priors is the total box count of each position of inputs.
+
+        variances: the expanded variances of PriorBox. The layout is
+        [num_priors, 4]. num_priors is the total box count of each position of inputs
 
 
     Examples:
         .. code-block:: python
-          mbox_locs, mbox_confs, box, var = layers.multi_box_head(
+
+          mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
             inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
             image=images,
             num_classes=21,
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 9de88e2c3205ace74beff43df7ae8956897d965a..8d153b75cd49953770cfa89348914a375be82a82 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -22,9 +22,9 @@ from ..executor import global_scope
 from layer_function_generator import generate_layer_fn, templatedoc
 
 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer',
-    'random_data_generator', 'Preprocessor', 'load'
+    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
+    'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
+    'double_buffer', 'random_data_generator', 'Preprocessor', 'load'
 ]
 
 
@@ -109,10 +109,35 @@ class BlockGuardServ(BlockGuard):
 
 class ListenAndServ(object):
     """
-    ListenAndServ class.
+    **ListenAndServ Layer**
+    
+    ListenAndServ is used to create a rpc server bind and listen
+    on specific TCP port, this server will run the sub-block when
+    received variables from clients.
 
-    ListenAndServ class is used to wrap listen_and_serv op to create a server
-    which can receive variables from clients and run a block.
+    Args:
+        endpoint(string): IP:port string which the server will listen on.
+        inputs(list): a list of variables that the server will get from clients.
+        fan_in(int): how many client are expected to report to this server, default: 1.
+        optimizer_mode(bool): whether to run the server as a parameter server, default: True.
+
+    Examples:
+        .. code-block:: python
+
+            with fluid.program_guard(main):
+                serv = layers.ListenAndServ(
+                    "127.0.0.1:6170", ["X"], optimizer_mode=False)
+                with serv.do():
+                    x = layers.data(
+                        shape=[32, 32],
+                        dtype='float32',
+                        name="X",
+                        append_batch_size=False)
+                    fluid.initializer.Constant(value=1.0)(x, main.global_block())
+                    layers.scale(x=x, scale=10.0, out=out_var)
+
+            exe = fluid.Executor(place)
+            exe.run(main)
     """
 
     def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
@@ -177,18 +202,17 @@ class ListenAndServ(object):
             })
 
 
-def Send(endpoints, send_vars, get_vars=None):
+def Send(endpoints, send_vars, sync=True):
     """
-    Send layer
+    Send variables to the server side, and get vars from server
+    side when server have finished running server side program.
 
     Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma seperated IP:PORT pairs in the order
                    of send_vars to send
-        send_vars: vars to send
-        get_vars: vars to get from server after send completes.
-
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
+        send_vars (list): variables to send to server
+        sync (bool): whether to wait the request finish
+    
     """
     assert (type(send_vars) == list)
 
@@ -196,40 +220,33 @@ def Send(endpoints, send_vars, get_vars=None):
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Send", **locals())
-    if not get_vars:
-        get_vars = []
-        for s in send_vars:
-            v = helper.create_tmp_variable(dtype=s.dtype, stop_gradient=True)
-            get_vars.append(v)
     rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
 
     helper.append_op(
         type="send",
         inputs={"X": send_vars},
-        outputs={"Out": get_vars},
         attrs={
             "endpoints": endpoints,
             "epmap": epmap,
             rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
         })
+    if sync:
+        helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
 
-    return get_vars
 
-
-def Recv(endpoints, get_vars):
+def Recv(endpoints, get_vars, sync=True):
     """
-    Recv layer
+    Receive variables from server side
 
     Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma seperated IP:PORT pairs in the order
                    of send_vars to send
-        send_vars: vars to send
-        get_vars: vars to get from server after send completes.
+        get_vars (list): vars to get from server after send completes.
+        sync (bool): whether to wait the request finish
 
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
+    Returns:
+        list: list of received variables
     """
-    assert (type(send_vars) == list)
     assert (type(get_vars) == list)
 
     epmap = endpoints.split(",")
@@ -242,6 +259,9 @@ def Recv(endpoints, get_vars):
         outputs={"Out": get_vars},
         attrs={"endpoints": endpoints,
                "epmap": epmap})
+    if sync:
+        helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
+    return get_vars
 
 
 def monkey_patch_reader_methods(reader):
@@ -292,6 +312,7 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
+@templatedoc(op_type='create_recordio_file_reader')
 def open_recordio_file(filename,
                        shapes,
                        lod_levels,
@@ -299,34 +320,30 @@ def open_recordio_file(filename,
                        pass_num=1,
                        for_parallel=True):
     """
-    Open a RecordIO file
-
-    This layer takes a RecordIO file to read from and returns a Reader Variable.
-    Via the Reader Variable, we can get data from the given RecordIO file.
+    ${comment}
 
     Args:
-       filename(str): The RecordIO file's name.
+       filename(${filename_type}): ${filename_comment}.
        shapes(list): List of tuples which declaring data shapes.
-       lod_levels(list): List of ints which declaring data lod_level.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
        dtypes(list): List of strs which declaring data type.
        pass_num(int): Number of passes to run.
        for_parallel(Bool): Set it as True if you are going to run
             subsequent operators in parallel.
 
     Returns:
-       Variable: A Reader Variable via which we can get RecordIO file data.
+       ${out_comment}.
 
     Examples:
-       .. code-block:: python
-
-         reader = fluid.layers.io.open_recordio_file(
-                                          filename='./data.recordio',
-                                          shapes=[(3,224,224), (1)],
-                                          lod_levels=[0, 0],
-                                          dtypes=['float32', 'int64'])
 
-         # Via the reader, we can use 'read_file' layer to get data:
-         image, label = fluid.layers.io.read_file(reader)
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.io.open_recordio_file(
+        >>>                               filename='./data.recordio',
+        >>>                               shapes=[(3,224,224), (1)],
+        >>>                               lod_levels=[0, 0],
+        >>>                               dtypes=['float32', 'int64'])
+        >>> # Via the reader, we can use 'read_file' layer to get data:
+        >>> image, label = fluid.layers.io.read_file(reader)
     """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
@@ -386,16 +403,16 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
        Variable: A Reader Variable from which we can get random data.
 
     Examples:
-       .. code-block:: python
 
-         reader = fluid.layers.io.random_data_generator(
-                                          low=0.0,
-                                          high=1.0,
-                                          shapes=[(3,224,224), (1)],
-                                          lod_levels=[0, 0])
+        .. code-block:: python
 
-         # Via the reader, we can use 'read_file' layer to get data:
-         image, label = fluid.layers.io.read_file(reader)
+            reader = fluid.layers.random_data_generator(
+                                             low=0.0,
+                                             high=1.0,
+                                             shapes=[[3,224,224], [1]],
+                                             lod_levels=[0, 0])
+            # Via the reader, we can use 'read_file' layer to get data:
+            image, label = fluid.layers.read_file(reader)
     """
     dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
     shape_concat = []
@@ -544,16 +561,77 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 
 
 def shuffle(reader, buffer_size):
+    """
+    Shuffle the reader.
+    """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
 
 
 def batch(reader, batch_size):
+    """
+    This layer is a reader decorator. It takes a reader and adds 
+    'batching' decoration on it. When reading with the result 
+    decorated reader, output data will be automatically organized 
+    to the form of batches.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'batching'.
+        batch_size(int): The batch size.
+
+    Returns:
+        Variable: The reader which has been decorated with 'batching'.
+
+    Examples:
+        .. code-block:: python
+
+            raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                           './data2.recordio'],
+                                                    shapes=[(3,224,224), (1)],
+                                                    lod_levels=[0, 0],
+                                                    dtypes=['float32', 'int64'],
+                                                    thread_num=2,
+                                                    buffer_size=2)
+            batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5)
+
+            # If we read data with the raw_reader:
+            #     data = fluid.layers.read_file(raw_reader)
+            # We can only get data instance by instance.
+            # 
+            # However, if we read data with the batch_reader:
+            #     data = fluid.layers.read_file(batch_reader)
+            # Each 5 adjacent instances will be automatically combined together 
+            # to become a batch. So what we get('data') is a batch data instead 
+            # of an instance.
+    """
     return __create_unshared_decorated_reader__(
         'create_batch_reader', reader, {'batch_size': int(batch_size)})
 
 
 def double_buffer(reader, place=None, name=None):
+    """
+    Wrap a double buffer reader. The data will copy to target place with a
+    double buffer queue. If the target place is None, the place that executor
+    perform on will be used.
+
+    Args:
+        reader(Variable): the reader variable need to be wrapped.
+        place(Place): the place of target data. Default is the sample place of
+            executor perform.
+
+        name(str): Variable name. None if the user does not care.
+
+    Returns:
+        wrapped reader with double buffer.
+
+    Examples:
+
+        >>> reader = fluid.layers.open_files(filenames=['somefile'],
+        >>>                                  shapes=[[-1, 784], [-1, 1]],
+        >>>                                  dtypes=['float32', 'int64'])
+        >>> reader = fluid.layers.double_buffer(reader)
+        >>> img, label = fluid.layers.read_file(reader)
+    """
     attrs = dict()
     if place is not None:
         attrs['place'] = str(place).upper()
@@ -571,15 +649,41 @@ def parallel(reader):
                                               {})
 
 
-def read_file(file_obj):
+def read_file(reader):
+    """
+    Execute the given reader and get data via it.
+
+    A reader is also a Variable. It can be a raw reader generated by 
+    `fluid.layers.open_files()` or a decorated one generated by 
+    `fluid.layers.double_buffer()` and so on.
+
+    Args:
+
+        reader(Variable): The reader to execute.
+
+    Returns:
+        Tuple[Variable]: Data read via the given reader.
+
+    Examples:
+        .. code-block:: python
+
+           data_file = fluid.layers.open_files(
+                filenames=['mnist.recordio'],
+                shapes=[(-1, 748), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"])
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(data_file, batch_size=64))
+            input, label = fluid.layers.read_file(data_file)
+    """
     helper = LayerHelper('read_file')
     out = [
         helper.create_tmp_variable(
             stop_gradient=True, dtype='float32')
-        for _ in range(len(file_obj.desc.shapes()))
+        for _ in range(len(reader.desc.shapes()))
     ]
     helper.append_op(
-        type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out})
+        type='read', inputs={'Reader': [reader]}, outputs={'Out': out})
     if len(out) == 1:
         return out[0]
     else:
@@ -587,6 +691,26 @@ def read_file(file_obj):
 
 
 class Preprocessor(object):
+    """
+    A block for data pre-processing in reader.
+
+    Args:
+        reader (Variable): A reader variable.
+        name (str, default None): The name of the reader.
+
+    Examples:
+          .. code-block:: python
+
+            preprocessor = fluid.layers.io.Preprocessor(reader=reader)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+
+    """
     BEFORE_SUB_BLOCK = 0
     IN_SUB_BLOCK = 1
     AFTER_SUB_BLOCK = 2
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index cb60a3aec9a5a69f1eed281eb017384a621c66a8..3096389101a5e5b302c78145b8bc9f1d71f6b8cb 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -44,6 +44,18 @@ def _type_to_str_(tp):
     return framework_pb2.AttrType.Name(tp)
 
 
+_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$")
+_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$")
+_two_bang_pattern_ = re.compile(r"!!([^!]+)!!")
+
+
+def escape_math(text):
+    return _two_bang_pattern_.sub(
+        r'$$\1$$',
+        _single_dollar_pattern_.sub(r':math:`\1`',
+                                    _two_dollar_pattern_.sub(r"!!\1!!", text)))
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -59,18 +71,16 @@ def _generate_doc_string_(op_proto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
     buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
+    buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
         line_begin = '    {0}: '.format(_convert_(each_input.name))
         buf.write(line_begin)
-        buf.write(each_input.comment)
-        buf.write('\n')
-        buf.write(' ' * len(line_begin))
-        buf.write('Duplicable: ')
-        buf.write(str(each_input.duplicable))
-        buf.write('  Optional: ')
-        buf.write(str(each_input.dispensable))
+        buf.write(escape_math(each_input.comment))
+        if each_input.duplicable:
+            buf.write("  Duplicatable.")
+        if each_input.dispensable:
+            buf.write("  Optional.")
         buf.write('\n')
 
     skip_attrs = OpProtoHolder.generated_op_attr_names()
@@ -83,7 +93,7 @@ def _generate_doc_string_(op_proto):
         buf.write(' (')
         buf.write(_type_to_str_(each_attr.type))
         buf.write('): ')
-        buf.write(each_attr.comment)
+        buf.write(escape_math(each_attr.comment))
         buf.write('\n')
 
     if len(op_proto.outputs) != 0:
@@ -92,7 +102,7 @@ def _generate_doc_string_(op_proto):
         for each_opt in op_proto.outputs:
             if not each_opt.intermediate:
                 break
-        buf.write(each_opt.comment)
+        buf.write(escape_math(each_opt.comment))
 
     return buf.getvalue()
 
@@ -224,9 +234,6 @@ def autodoc(comment=""):
     return __impl__
 
 
-_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$")
-
-
 def templatedoc(op_type=None):
     """
     Decorator of layer function. It will use the docstring from the layer
@@ -244,9 +251,6 @@ def templatedoc(op_type=None):
     def trim_ending_dot(msg):
         return msg.rstrip('.')
 
-    def escape_inline_math(msg):
-        return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg)
-
     def __impl__(func):
         if op_type is None:
             op_type_name = func.__name__
@@ -260,7 +264,7 @@ def templatedoc(op_type=None):
         for line in comment_lines:
             line = line.strip()
             if len(line) != 0:
-                comment += escape_inline_math(line)
+                comment += escape_math(line)
                 comment += " "
             elif len(comment) != 0:
                 comment += "\n    \n    "
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 716cc7824eff0c56cc55a055310fa8b1913ac5e6..6071e3e74218e4db4cddc223818d3a9b7086fd86 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -25,10 +25,11 @@ import nn
 import ops
 import tensor
 from ..initializer import init_on_cpu
+from ..framework import default_main_program, Parameter
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
 ]
 
 
@@ -70,21 +71,40 @@ def noam_decay(d_model, warmup_steps):
 
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies exponential decay to the learning rate.
+    """
+    Applies exponential decay to the learning rate. 
+
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, the learning rate will be decayed by 
+    'decay_rate' every 'decay_steps' steps.
+
+    >>> if staircase == True:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
 
-    ```python
-    decayed_learning_rate = learning_rate *
-            decay_rate ^ (global_step / decay_steps)
-    ```
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+                            Default: False
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
+
+    Examples:
+        .. code-block:: python
+
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
+
     """
     global_step = _decay_step_counter()
 
@@ -128,22 +148,39 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
 
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies inverse time decay to the initial learning rate.
+    """
+    Applies inverse time decay to the initial learning rate.
 
-    >>> if staircase:
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, an inverse decay function will be 
+    applied to the initial learning rate.
+
+    >>> if staircase == True:
     >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
     >>> else:
     >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
 
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training.
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+                            Default: False
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
+
+    Examples:
+        .. code-block:: python
+
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.inverse_time_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
     """
     global_step = _decay_step_counter()
 
@@ -162,25 +199,28 @@ def polynomial_decay(learning_rate,
                      end_learning_rate=0.0001,
                      power=1.0,
                      cycle=False):
-    """Applies polynomial decay to the initial learning rate.
+    """
+    Applies polynomial decay to the initial learning rate.
+
+    .. code-block:: python
+
+     if cycle:
+       decay_steps = decay_steps * ceil(global_step / decay_steps)
+     else:
+       global_step = min(global_step, decay_steps)
+       decayed_learning_rate = (learning_rate - end_learning_rate) *
+            (1 - global_step / decay_steps) ^ power + end_learning_rate
 
-    >>> if cycle:
-    >>>     decay_steps = decay_steps * ceil(global_step / decay_steps)
-    >>> else:
-    >>>     global_step = min(global_step, decay_steps)
-    >>> decayed_learning_rate = (learning_rate - end_learning_rate) *
-    >>>                   (1 - global_step / decay_steps) ^ power +
-    >>>                   end_learning_rate
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        end_learning_rate: A Python `float` number.
-        power: A Python `float` number
-        cycle: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        decay_steps(int32): A Python `int32` number.
+        end_learning_rate(float): A Python `float` number.
+        power(float): A Python `float` number.
+        cycle(bool): If set true, decay the learning rate every decay_steps.
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
     """
     global_step = _decay_step_counter()
 
@@ -209,15 +249,27 @@ def polynomial_decay(learning_rate,
 def piecewise_decay(boundaries, values):
     """Applies piecewise decay to the initial learning rate.
 
-    >>> boundaries = [10000, 20000]
-    >>> values = [1.0, 0.5, 0.1]
-    >>>
-    >>> if step < 10000:
-    >>>     learning_rate = 1.0
-    >>> elif 10000 <= step < 20000:
-    >>>     learning_rate = 0.5
-    >>> else:
-    >>>     learning_rate = 0.1
+      The algorithm can be described as the code below.
+
+      .. code-block:: python
+
+        boundaries = [10000, 20000]
+        values = [1.0, 0.5, 0.1]
+        if step < 10000:
+            learning_rate = 1.0
+        elif 10000 <= step < 20000:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+    Args:
+        boundaries: A list of steps numbers.
+        values: A list of learning rate values that will be picked during
+            different step boundaries.
+
+    Returns:
+        The decayed learning rate.
+
+
     """
 
     if len(values) - len(boundaries) != 1:
@@ -249,3 +301,41 @@ def piecewise_decay(boundaries, values):
                 tensor.assign(last_value_var, lr)
 
     return lr
+
+
+def append_LARS(params_grads, learning_rate, weight_decay):
+    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
+       each layer.
+
+    ```python
+        learning_rate *= local_gw_ratio * sqrt(sumsq(param))
+                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
+    ```
+
+    Args:
+        learning_rate: A learning rate Variable. This
+          is the global learning rate for LARS.
+        weight_decay: A Python `float` number.
+
+    Returns:
+        The decayed learning rate
+    """
+
+    def _balanced_weight(param_norm, grad_norm):
+        if weight_decay == 1.0:
+            return grad_norm + param_norm
+        else:
+            return grad_norm + weight_decay * param_norm
+
+    for param, grad in params_grads:
+        param_lr = param.optimize_attr['learning_rate']
+        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+        if type(param_lr) == float and param_lr == 1.0:
+            decayed_lr = learning_rate * param_norm \
+                         / _balanced_weight(param_norm, grad_norm)
+        else:
+            decayed_lr = learning_rate * param_lr * param_norm \
+                         / _balanced_weight(param_norm, grad_norm)
+        # set back param local learning rate
+        param.optimize_attr['learning_rate'] = decayed_lr
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index a1c64ce2771526cbd0baa944f97d01e7878b3ac1..58de1b6b9fe17a24203e93de6780190b9fc6b3e7 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -27,8 +27,32 @@ __all__ = ['accuracy', 'auc']
 
 def accuracy(input, label, k=1, correct=None, total=None):
     """
+    accuracy layer.
+    Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
+
     This function computes the accuracy using the input and label.
-    The output is the top k inputs and their indices.
+    If the correct label occurs in top k predictions, then correct will increment by one.
+    Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
+
+    Args:
+        input(Variable): The input of accuracy layer, which is the predictions of network.
+          Carry LoD information is supported.
+        label(Variable): The label of dataset.
+        k(int): The top k predictions for each class will be checked.
+        correct(Variable): The correct predictions count.
+        total(Variable): The total entries count.
+
+    Returns:
+        Variable: The correct rate.
+
+    Examples:
+        .. code-block:: python
+
+           data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32")
+           label = fluid.layers.data(name="data", shape=[-1,1], dtype="int32")
+           predict = fluid.layers.fc(input=data, size=10)
+           acc = fluid.layers.accuracy(input=predict, label=label, k=5)
+
     """
     helper = LayerHelper("accuracy", **locals())
     topk_out, topk_indices = nn.topk(input, k=k)
@@ -53,6 +77,43 @@ def accuracy(input, label, k=1, correct=None, total=None):
 
 
 def auc(input, label, curve='ROC', num_thresholds=200):
+    """
+    **Area Under the Curve (AUC) Layer**
+
+    This implementation computes the AUC according to forward output and label.
+    It is used very widely in binary classification evaluation. 
+
+    Note: If input label contains values other than 0 and 1, it will be cast 
+    to `bool`. Find the relevant definitions `here <https://en.wikipedia.org\
+    /wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
+
+    There are two types of possible curves:
+
+        1. ROC: Receiver operating characteristic;
+        2. PR: Precision Recall
+
+    Args:
+        input(Variable): A floating-point 2D Variable, values are in the range 
+                         [0, 1]. Each row is sorted in descending order. This 
+                         input should be the output of topk. Typically, this 
+                         Variable indicates the probability of each label.
+        label(Variable): A 2D int Variable indicating the label of the training 
+                         data. The height is batch size and width is always 1.
+        curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
+        num_thresholds(int): The number of thresholds to use when discretizing 
+                             the roc curve. Default 200.
+
+    Returns:
+        Variable: A scalar representing the current AUC.
+
+    Examples:
+        .. code-block:: python
+        
+            # network is a binary classification model and label the ground truth
+            prediction = network(image, is_infer=True)
+            auc_out=fluid.layers.auc(input=prediction, label=label)
+    """
+
     warnings.warn(
         "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
         but can not aggregate them and get the pass AUC, because pass \
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 2c1f9888282188b8066924cef0108ee697f91da6..f6f188df0d6a9a33f4ad858f00c1ba0fd36661b9 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -25,20 +25,74 @@ import utils
 import random
 
 __all__ = [
-    'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru',
-    'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy',
-    'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d',
-    'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'batch_norm',
-    'beam_search_decode', 'conv2d_transpose', 'sequence_expand', 'lstm_unit',
-    'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod',
-    'sequence_first_step', 'sequence_last_step', 'dropout', 'split',
-    'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk',
-    'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce',
-    'beam_search', 'row_conv', 'multiplex', 'layer_norm',
-    'softmax_with_cross_entropy', 'smooth_l1', 'one_hot',
-    'autoincreased_step_counter', 'reshape', 'lod_reset', 'lrn', 'pad',
-    'label_smooth', 'roi_pool', 'dice_loss', 'image_resize',
-    'image_resize_short', 'resize_bilinear', 'gather', 'random_crop', 'mean_iou'
+    'fc',
+    'embedding',
+    'dynamic_lstm',
+    'dynamic_lstmp',
+    'dynamic_gru',
+    'gru_unit',
+    'linear_chain_crf',
+    'crf_decoding',
+    'cos_sim',
+    'cross_entropy',
+    'square_error_cost',
+    'chunk_eval',
+    'sequence_conv',
+    'conv2d',
+    'conv3d',
+    'sequence_pool',
+    'sequence_softmax',
+    'softmax',
+    'pool2d',
+    'pool3d',
+    'batch_norm',
+    'beam_search_decode',
+    'conv2d_transpose',
+    'conv3d_transpose',
+    'sequence_expand',
+    'lstm_unit',
+    'reduce_sum',
+    'reduce_mean',
+    'reduce_max',
+    'reduce_min',
+    'reduce_prod',
+    'sequence_first_step',
+    'sequence_last_step',
+    'dropout',
+    'split',
+    'ctc_greedy_decoder',
+    'edit_distance',
+    'l2_normalize',
+    'matmul',
+    'topk',
+    'warpctc',
+    'sequence_reshape',
+    'transpose',
+    'im2sequence',
+    'nce',
+    'beam_search',
+    'row_conv',
+    'multiplex',
+    'layer_norm',
+    'softmax_with_cross_entropy',
+    'smooth_l1',
+    'one_hot',
+    'autoincreased_step_counter',
+    'reshape',
+    'lod_reset',
+    'lrn',
+    'pad',
+    'label_smooth',
+    'roi_pool',
+    'dice_loss',
+    'image_resize',
+    'image_resize_short',
+    'resize_bilinear',
+    'gather',
+    'random_crop',
+    'mean_iou',
+    'relu',
+    'log',
 ]
 
 
@@ -54,14 +108,15 @@ def fc(input,
     """
     **Fully Connected Layer**
 
-    The fully connected layer can take multiple tensors as its inputs. It
-    creates a variable called weights for each input tensor, which represents
-    a fully connected weight matrix from each input unit to each output unit.
-    The fully connected layer multiplies each input tensor with its coresponding
-    weight to produce an output Tensor. If multiple input tensors are given,
-    the results of multiple multiplications will be sumed up. If bias_attr is
-    not None, a bias variable will be created and added to the output. Finally,
-    if activation is not None, it will be applied to the output as well.
+    This function creates a fully connected layer in the network. It can take 
+    multiple tensors as its inputs. It creates a variable called weights for 
+    each input tensor, which represents a fully connected weight matrix from 
+    each input unit to each output unit. The fully connected layer multiplies 
+    each input tensor with its coresponding weight to produce an output Tensor. 
+    If multiple input tensors are given, the results of multiple multiplications 
+    will be sumed up. If bias_attr is not None, a bias variable will be created 
+    and added to the output. Finally, if activation is not None, it will be applied 
+    to the output as well.
 
     This process can be formulated as follows:
 
@@ -102,7 +157,7 @@ def fc(input,
         name (str, default None): The name of this layer.
 
     Returns:
-        A tensor variable storing the transformation result.
+        Variable: The transformation result.
 
     Raises:
         ValueError: If rank of the input tensor is less than 2.
@@ -110,8 +165,7 @@ def fc(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name="data", shape=[32, 32], dtype="float32")
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
           fc = fluid.layers.fc(input=data, size=1000, act="tanh")
     """
 
@@ -173,11 +227,11 @@ def embedding(input,
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update.
-        is_distributed (bool): Whether to run lookup table from remote parameter server.
+        is_distributed(bool): Whether to run lookup table from remote parameter server.
         padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
             Otherwise the given :attr:`padding_idx` indicates padding the output
             with zeros whenever lookup encounters it in :attr:`input`. If
-            :math:`padding_idx < 0`, the padding_idx to use in lookup is
+            :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is
             :math:`size[0] + dim`.
         param_attr(ParamAttr): Parameters for this layer
         dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
@@ -213,6 +267,7 @@ def embedding(input,
     return tmp
 
 
+@templatedoc(op_type="lstm")
 def dynamic_lstm(input,
                  size,
                  h_0=None,
@@ -227,56 +282,11 @@ def dynamic_lstm(input,
                  dtype='float32',
                  name=None):
     """
-    **Dynamic LSTM Layer**
-
-    The defalut implementation is diagonal/peephole connection
-    (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
-
-    .. math::
-
-        i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
-
-        f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
-
-        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-
-        o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
-
-        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-        h_t & = o_t \odot act_h(c_t)
-
-    where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is
-    the matrix of weights from the input gate to the input), :math:`W_{ic}, \
-    W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
-    our implementation, we use vectors to reprenset these diagonal weight
-    matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
-    gate bias vector), :math:`\sigma` is the non-linear activations, such as
-    logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
-    gate, forget gate, output gate, and cell activation vectors, respectively,
-    all of which have the same size as the cell output activation vector :math:`h`.
-
-    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
-    and :math:`act_h` are the cell input and cell output activation functions
-    and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called
-    candidate hidden state, which is computed based on the current input and
-    the previous hidden state.
-
-    Set `use_peepholes` to `False` to disable peephole connection. The formula
-    is omitted here, please refer to the paper
-    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-
-    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
-    operations on the input :math:`x_{t}` are NOT included in this operator.
-    Users can choose to use fully-connect layer before LSTM layer.
+    ${comment}
 
     Args:
-        input(Variable): The input of dynamic_lstm layer, which supports
-                         variable-time length input sequence. The underlying
-                         tensor in this Variable is a matrix with shape
-                         (T X 4D), where T is the total time steps in this
-                         mini-batch, D is the hidden size.
-        size(int): 4 * hidden size.
+        input (Variable): ${input_comment}
+        size (int): 4 * hidden size.
         h_0(Variable): The initial hidden state is an optional input, default is zero.
                        This is a tensor with shape (N x D), where N is the
                        batch size and D is the hidden size.
@@ -291,33 +301,26 @@ def dynamic_lstm(input,
                                                 W_{fh}, W_{oh}`}
                                - The shape is (D x 4D), where D is the hidden
                                  size.
-        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
                               setting `use_peepholes` to `True`.
 
                               1. `use_peepholes = False`
-                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                - The shape is (1 x 4D).
+                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                 - The shape is (1 x 4D).
                               2. `use_peepholes = True`
-                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                  W_{fc}, W_{oc}`}.
-                                - The shape is (1 x 7D).
-        use_peepholes(bool): Whether to enable diagonal/peephole connections,
-                             default `True`.
-        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
-        gate_activation(str): The activation for input gate, forget gate and
-                              output gate. Choices = ["sigmoid", "tanh", "relu",
-                              "identity"], default "sigmoid".
-        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
-                              "tanh", "relu", "identity"], default "tanh".
-        candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh",
-                                  "relu", "identity"],
-                              default "tanh".
-        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
+                                 - The shape is (1 x 7D).
+        use_peepholes (bool): ${use_peepholes_comment}
+        is_reverse (bool): ${is_reverse_comment}
+        gate_activation (str): ${gate_activation_comment}
+        cell_activation (str): ${cell_activation_comment}
+        candidate_activation (str): ${candidate_activation_comment}
+        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
 
     Returns:
         tuple: The hidden state, and cell state of LSTM. The shape of both \
@@ -488,27 +491,31 @@ def dynamic_lstmp(input,
         cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                               "tanh", "relu", "identity"], default "tanh".
         candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh",
-                                  "relu", "identity"],
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         proj_activation(str): The activation for projection output.
-                              Choices = ["sigmoid", "tanh",
-                                  "relu", "identity"],
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                               default "tanh".
         dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
 
     Returns:
-        tuple: The projection of hidden state, and cell state of LSTMP. The \
-               shape of projection is (T x P), for the cell state which is \
-               (T x D), and both LoD is the same with the `input`.
+        tuple: A tuple of two output variable: the projection of hidden state, \
+               and cell state of LSTMP. The shape of projection is (T x P), \
+               for the cell state which is (T x D), and both LoD is the same \
+               with the `input`.
 
     Examples:
+
         .. code-block:: python
 
+            dict_dim, emb_dim = 128, 64
+            data = fluid.layers.data(name='sequence', shape=[1],
+                                     dtype='int32', lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim, proj_dim = 512, 256
-            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+            fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                      act=None, bias_attr=None)
             proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
                                                      size=hidden_dim * 4,
@@ -574,10 +581,10 @@ def dynamic_gru(input,
                 candidate_activation='tanh',
                 h_0=None):
     """
-    **Dynamic GRU Layer**
+    **Gated Recurrent Unit (GRU) Layer**
 
     Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
-    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_
+    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
 
     The formula is as follows:
 
@@ -624,17 +631,25 @@ def dynamic_gru(input,
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
         candidate_activation(str): The activation for candidate hidden state.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
-        h_0 (Variable): The hidden output of the first time step.
+        h_0 (Variable): This is initial hidden state. If not set, default is
+            zero. This is a tensor with shape (N x D), where N is the number of
+            total time steps of input mini-batch feature and D is the hidden
+            size.
 
     Returns:
         Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
-            and lod is the same with the input.
+            and sequence length is the same with the input.
 
     Examples:
+
         .. code-block:: python
 
+            dict_dim, emb_dim = 128, 64
+            data = fluid.layers.data(name='sequence', shape=[1],
+                                     dtype='int32', lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim = 512
-            x = fluid.layers.fc(input=data, size=hidden_dim * 3)
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
             hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim)
     """
 
@@ -782,11 +797,14 @@ def linear_chain_crf(input, label, param_attr=None):
 
     Args:
         input(${emission_type}): ${emission_comment}
+        input(${transition_type}): ${transition_comment}
         label(${label_type}): ${label_comment}
         param_attr(ParamAttr): The attribute of the learnable parameter.
 
     Returns:
-        ${log_likelihood_comment}
+        output(${emission_exps_type}): ${emission_exps_comment} \n
+        output(${transition_exps_type}): ${transition_exps_comment} \n
+        output(${log_likelihood_type}): ${log_likelihood_comment}
 
     """
     helper = LayerHelper('linear_chain_crf', **locals())
@@ -821,11 +839,19 @@ def crf_decoding(input, param_attr, label=None):
 
     Args:
         input(${emission_type}): ${emission_comment}
+
         param_attr(ParamAttr): The parameter attribute for training.
+
         label(${label_type}): ${label_comment}
 
     Returns:
-        ${viterbi_path_comment}
+        Variable: ${viterbi_path_comment}
+    
+    Examples:
+        .. code-block:: python
+
+           crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
     """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
@@ -840,15 +866,15 @@ def crf_decoding(input, param_attr, label=None):
     return viterbi_path
 
 
+@templatedoc()
 def cos_sim(X, Y):
     """
-    This function performs the cosine similarity between two tensors
-    X and Y and returns that as the output.
+    ${comment}
 
     Args:
-        X (Variable): The input X.
-        Y (Variable): The input Y.
-    
+        X (Variable): ${x_comment}.
+        Y (Variable): ${y_comment}.
+
     Returns:
         Variable: the output of cosine(X, Y).
     """
@@ -872,13 +898,13 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
 
     Drop or keep each element of `x` independently. Dropout is a regularization
     technique for reducing overfitting by preventing neuron co-adaption during
-    training. The dropout operator randomly set (according to the given dropout
+    training. The dropout operator randomly sets (according to the given dropout
     probability) the outputs of some units to zero, while others are remain
     unchanged.
 
     Args:
-        x (Variable): The input tensor.
-         dropout_prob (float): Probability of setting units to zero.
+        x (Variable): The input tensor variable.
+        dropout_prob (float): Probability of setting units to zero.
         is_test (bool): A flag indicating whether it is in test phrase or not.
         seed (int): A Python integer used to create random seeds. If this
                     parameter is set to None, a random seed is used.
@@ -888,13 +914,14 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
                          will be named automatically.
 
     Returns:
-        Variable: A tensor variable.
+        Variable: A tensor variable is the shape with `x`.
 
     Examples:
+
         .. code-block:: python
 
-          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-          droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
+            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            droped = fluid.layers.dropout(x, dropout_prob=0.5)
     """
 
     helper = LayerHelper('dropout', **locals())
@@ -1044,20 +1071,94 @@ def chunk_eval(input,
                num_chunk_types,
                excluded_chunk_types=None):
     """
+    **Chunk Evaluator**
+
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
 
+    For some basics of chunking, please refer to
+    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+
+    ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
+    and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+    Here is a NER example of labeling for these tagging schemes:
+
+    .. code-block:: python
+    
+       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
+              Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
+       IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+       IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+       IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+       IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
+
+    There are three chunk types(named entity types) including PER(person), ORG(organization)
+    and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
+
+    Since the calculations actually use label ids rather than labels, extra attention
+    should be paid when mapping labels to ids to make CheckEvalOp work. The key point
+    is that the listed equations are satisfied by ids.
+
+    .. code-block:: python
+
+       tag_type = label % num_tag_type
+       chunk_type = label / num_tag_type
+
+    where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
+    is the num of chunk types, and `tag_type` get its value from the following table.
+
+    .. code-block:: python
+    
+       Scheme Begin Inside End   Single
+        plain   0     -      -     -
+        IOB     0     1      -     -
+        IOE     -     0      1     -
+        IOBES   0     1      2     3
+
+    Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
+    PER and LOC. To satisfy the above equations, the label map can be like this:
+
+    .. code-block:: python
+
+       B-ORG  0
+       I-ORG  1
+       B-PER  2
+       I-PER  3
+       B-LOC  4
+       I-LOC  5
+       O      6
+
+    It's not hard to verify the equations noting that the num of chunk types
+    is 3 and the num of tag types in IOB scheme is 2. For example, the label
+    id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
+    I-LOC is 2, which consistent with the results from the equations.
+
     Args:
         input (Variable): prediction output of the network.
         label (Variable): label of the test data set.
         chunk_scheme (str): ${chunk_scheme_comment}
         num_chunk_types (int): ${num_chunk_types_comment}
         excluded_chunk_types (list): ${excluded_chunk_types_comment}
-    
+
     Returns:
-        tuple: tuple containing: (precision, recall, f1_score,
-               num_infer_chunks, num_label_chunks,
-               num_correct_chunks)
+        tuple: tuple containing: precision, recall, f1_score,
+        num_infer_chunks, num_label_chunks,
+        num_correct_chunks
+    
+    Examples:
+        .. code-block:: python
+
+            crf = fluid.layers.linear_chain_crf(
+                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
+            crf_decode = fluid.layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+            fluid.layers.chunk_eval(
+                input=crf_decode,
+                label=label,
+                chunk_scheme="IOB",
+                num_chunk_types=(label_dict_len - 1) / 2)
     """
     helper = LayerHelper("chunk_eval", **locals())
 
@@ -1113,15 +1214,11 @@ def sequence_conv(input,
         bias_attr (ParamAttr|None): attributes for bias
         param_attr (ParamAttr|None): attributes for parameter
         act (str): the activation type
-    
+
     Returns:
         Variable: output of sequence_conv
     """
 
-    # FIXME(dzh) : want to unify the argument of python layer
-    # function. So we ignore some unecessary attributes.
-    # such as, padding_trainable, context_start.
-
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
     filter_shape = [filter_size * input.shape[1], num_filters]
@@ -1146,6 +1243,41 @@ def sequence_conv(input,
 
 
 def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
+    """
+    This function computes the softmax activation among all time-steps for each
+    sequence. The dimension of each time-step should be 1. Thus, the shape of
+    input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N` 
+    is the sum of the length of all sequences.
+
+    For i-th sequence in a mini-batch:
+
+    .. math::
+
+        Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+
+    For example, for a mini-batch of 3 sequences with variable-length,
+    each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+    then softmax will be computed among :math:`X[0:2, :]`, :math:`X[2:5, :]`,
+    :math:`X[5:7, :]`, and :math:`N` turns out to be 7.
+
+    Args:
+        input (Variable): The input variable which is a LoDTensor.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
+        library is installed. Default: True
+    
+    Returns:
+        Variable: output of sequence_softmax
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
+    """
     helper = LayerHelper('sequence_softmax', **locals())
     dtype = helper.input_dtype()
     softmax_out = helper.create_tmp_variable(dtype)
@@ -1158,6 +1290,45 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
 
 
 def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
+    """
+    The input of the softmax layer is a 2-D tensor with shape N x K (N is the
+    batch_size, K is the dimension of input feature). The output tensor has the
+    same shape as the input tensor.
+
+    For each row of the input tensor, the softmax operator squashes the
+    K-dimensional vector of arbitrary real values to a K-dimensional vector of real
+    values in the range [0, 1] that add up to 1.
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in Input(X), we have:
+
+    .. math::
+
+        Out[i, j] = \\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}
+
+    Args:
+        input (Variable): The input variable.
+        bias_attr (ParamAttr): attributes for bias
+        param_attr (ParamAttr): attributes for parameter
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
+        library is installed.
+
+    Returns:
+        Variable: output of softmax
+
+    Examples:
+
+        .. code-block:: python
+
+             fc = fluid.layers.fc(input=x, size=10)
+             softmax = fluid.layers.softmax(input=fc)
+
+    """
     helper = LayerHelper('softmax', **locals())
     dtype = helper.input_dtype()
     softmax_out = helper.create_tmp_variable(dtype)
@@ -1183,14 +1354,17 @@ def conv2d(input,
            act=None,
            name=None):
     """
-    **Convlution2D Layer**
-
     The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are in NCHW format. Where N is batch size, C is the number of
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
     channels, H is the height of the feature, and W is the width of the feature.
-    The details of convolution layer, please refer UFLDL's `convolution,
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    Filter is in MCHW format, where M is the number of output image channels,
+    C is the number of input image channels, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    for more detials.
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
     applied to the final result.
@@ -1201,15 +1375,14 @@ def conv2d(input,
 
         Out = \sigma (W \\ast X + b)
 
-    In the above equation:
+    Where:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
@@ -1220,6 +1393,7 @@ def conv2d(input,
           Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
 
         - Output:
+
           Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
@@ -1231,7 +1405,7 @@ def conv2d(input,
 
     Args:
         input (Variable): The input image with [N, C, H, W] format.
-            num_filters(int): The number of filter. It is as same as the output
+        num_filters(int): The number of filter. It is as same as the output
             image channel.
         filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
             it must contain two integers, (filter_size_H, filter_size_W).
@@ -1254,7 +1428,8 @@ def conv2d(input,
         bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
         use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
             library is installed. Default: True
-        use_mkldnn (bool): Use mkldnn kernels or not.
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
         act (str): Activation type. Default: None
         name (str|None): A name for this layer(optional). If set None, the layer
             will be named automatically.
@@ -1270,13 +1445,9 @@ def conv2d(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.conv2d(
-              input=data, num_filters=2, filter_size=3, act="relu")
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
-    if stride is None:
-        stride = [1, 1]
 
     num_channels = input.shape[1]
 
@@ -1339,6 +1510,168 @@ def conv2d(input,
     return helper.append_activation(pre_act)
 
 
+def conv3d(input,
+           num_filters,
+           filter_size,
+           stride=1,
+           padding=0,
+           dilation=1,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           use_cudnn=True,
+           use_mkldnn=False,
+           act=None,
+           name=None):
+    """
+    **Convlution3D Layer**
+
+    The convolution3D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and
+    Output(Output) are in NCDHW format. Where N is batch size C is the number of
+    channels, D is the depth of the feature, H is the height of the feature,
+    and W is the width of the feature. Convlution3D is similar with Convlution2D
+    but adds one dimension(depth). If bias attribution and activation type are
+    provided, bias is added to the output of the convolution, and the
+    corresponding activation function is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
+
+        - Output:
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
+
+    Args:
+        input (Variable): The input image with [N, C, D, H, W] format.
+            num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv3d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not.
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu")
+    """
+
+    l_type = 'conv3d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels / groups
+
+    filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    padding = utils.convert_to_list(padding, 3, 'padding')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**3 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type=l_type,
+        inputs={
+            'Input': input,
+            'Filter': filter_param,
+        },
+        outputs={"Output": pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': use_mkldnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+
+    return helper.append_activation(pre_act)
+
+
 def sequence_pool(input, pool_type):
     """
     This function add the operator for sequence pooling.
@@ -1355,13 +1688,13 @@ def sequence_pool(input, pool_type):
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
 
        for different pool_type:
          average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
@@ -1420,13 +1753,13 @@ def sequence_first_step(input):
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
          out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
 
     Args:
@@ -1453,13 +1786,13 @@ def sequence_last_step(input):
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
          out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
 
     Args:
@@ -1479,6 +1812,7 @@ def sequence_last_step(input):
     return sequence_pool(input=input, pool_type="last")
 
 
+@templatedoc()
 def pool2d(input,
            pool_size=-1,
            pool_type="max",
@@ -1490,7 +1824,99 @@ def pool2d(input,
            use_mkldnn=False,
            name=None):
     """
-    This function adds the operator for pooling in 2 dimensions, using the
+    ${comment}
+
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of 
+                          input tensor is NCHW, where N is batch size, C is 
+                          the number of channels, H is the height of the 
+                          feature, and W is the width of the feature.
+        pool_size (int): The side length of pooling windows. All pooling 
+                         windows are squares with pool_size on a side.
+        pool_type: ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling: ${global_pooling_comment}
+        use_cudnn: ${use_cudnn_comment}
+        ceil_mode: ${ceil_mode_comment}
+        use_mkldnn: ${use_mkldnn_comment}
+        name (str|None): A name for this layer(optional). If set None, the 
+                        layer will be named automatically.
+
+    Returns:
+        Variable: The pooling result.
+
+    Raises:
+        ValueError: If 'pool_type' is not "max" nor "avg"
+        ValueError: If 'global_pooling' is False and 'pool_size' is -1
+        ValueError: If 'use_cudnn' is not a bool value.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.pool2d(
+                            input=data, 
+                            pool_size=2, 
+                            pool_type='max', 
+                            pool_stride=1, 
+                            global_pooling=False)
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+
+    if global_pooling is False and pool_size == -1:
+        raise ValueError(
+            "When the global_pooling is False, pool_size must be passed "
+            "and be a valid value. Received pool_size: " + str(pool_size))
+
+    pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
+    pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding')
+    pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    l_type = 'pool2d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding,
+            "use_cudnn": use_cudnn,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": use_mkldnn
+        })
+
+    return pool_out
+
+
+def pool3d(input,
+           pool_size=-1,
+           pool_type="max",
+           pool_stride=1,
+           pool_padding=0,
+           global_pooling=False,
+           use_cudnn=True,
+           ceil_mode=False,
+           use_mkldnn=False,
+           name=None):
+    """
+    This function adds the operator for pooling in 3-dimensions, using the
     pooling configurations mentioned in input parameters.
 
     Args:
@@ -1505,9 +1931,9 @@ def pool2d(input,
         use_mkldnn (bool): ${use_mkldnn_comment}
         name (str): A name for this layer(optional). If set None, the layer
             will be named automatically.
-    
+
     Returns:
-        Variable: output of pool2d layer.
+        Variable: output of pool3d layer.
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -1519,19 +1945,20 @@ def pool2d(input,
             "When the global_pooling is False, pool_size must be passed "
             "and be a valid value. Received pool_size: " + str(pool_size))
 
-    pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
-    pool_padding = utils.convert_to_list(pool_padding, 2, 'pool_padding')
-    pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride')
+    pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
+    pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding')
+    pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
 
     if not isinstance(use_cudnn, bool):
         raise ValueError("use_cudnn should be True or False")
 
-    helper = LayerHelper('pool2d', **locals())
+    l_type = "pool3d"
+    helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_tmp_variable(dtype)
 
     helper.append_op(
-        type="pool2d",
+        type=l_type,
         inputs={"X": input},
         outputs={"Out": pool_out},
         attrs={
@@ -1563,27 +1990,57 @@ def batch_norm(input,
                moving_variance_name=None,
                do_model_average_for_mean_and_var=False):
     """
-    This function helps create an operator to implement
-    the BatchNorm layer using the configurations from the input parameters.
+    **Batch Normalization Layer**
+
+    Can be used as a normalizer function for conv2d and fully_connected operations.
+    The required data format for this layer is one of the following:
+
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
     Args:
-        input (Variable): the input variable.
-        act (str): activation type
-        is_test (bool): whether to run batch_norm as test mode.
-        momentum (float): momentum
-        epsilon (float): epsilon, default 1e-05
-        param_attr (ParamAttr|None): attributes for parameter
-        bias_attr (ParamAttr|None): attributes for bias
-        data_layout (str): data layout, default NCHW
-        in_place (bool): if True, do not create tmp variable
-        use_mkldnn (bool): ${use_mkldnn_comment}
-        name (str): The name of this layer. It is optional.
-        moving_mean_name (str): The name of moving mean variable name, optional.
-        moving_variance_name (str): The name of moving variance name, optional.
-        do_model_average_for_mean_and_var (bool):
+        input(variable): The input variable which is a LoDTensor.
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        is_test(bool, Default False): Used for training or training.
+        momentum(float, Default 0.9):
+        epsilon(float, Default 1e-05):
+        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
+        bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
+        data_layout(string, default NCHW): NCHW|NHWC
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
+        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
+        name(string, Default None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
+        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
+        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
 
     Returns:
-        Variable: output of batch_norm layer.
+        Variable: A tensor variable which is the result after applying batch normalization on the input.
+
+    Examples:
+
+        .. code-block:: python
+
+            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+            hidden2 = fluid.layers.batch_norm(input=hidden1)
     """
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
@@ -1665,6 +2122,7 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
+@templatedoc()
 def layer_norm(input,
                scale=True,
                shift=True,
@@ -1675,20 +2133,11 @@ def layer_norm(input,
                act=None,
                name=None):
     """
-    **Layer Normalization**
-
-    Assume feature vectors exist on dimensions
-    :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
-    along these dimensions for each feature vector :math:`a` with size
-    :math:`H`, then normalize each feature vector using the corresponding
-    statistics. After that, apply learnable gain and bias on the normalized
-    tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
-
-    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+    ${comment}
 
     The formula is as follows:
 
-    .. math::
+    ..  math::
 
         \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
 
@@ -1696,6 +2145,15 @@ def layer_norm(input,
 
         h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
 
+    * :math:`a`: the vector representation of the summed inputs to the neurons
+    in that layer.
+
+    * :math:`H`: the number of hidden units in a layers
+
+    * :math:`g`: the trainable scale parameter.
+
+    * :math:`b`: the trainable bias parameter.
+
     Args:
         input(Variable): The input tensor variable.
         scale(bool): Whether to learn the adaptive gain :math:`g` after
@@ -1714,14 +2172,13 @@ def layer_norm(input,
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: A tensor variable with the same shape as the input.
+        ${y_comment}
 
     Examples:
-        .. code-block:: python
 
-            data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-            x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+        >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
     """
     helper = LayerHelper('layer_norm', **locals())
     dtype = helper.input_dtype()
@@ -1764,15 +2221,37 @@ def layer_norm(input,
 
 def beam_search_decode(ids, scores, name=None):
     """
-    ${beam_search_decode}
+    Beam Search Decode
+
+    This layers is to pack the output of beam search layer into sentences and
+    associated scores. It is usually called after the beam search layer.
+    Typically, the output of beam search layer is a tensor of selected ids, with
+    a tensor of the score of each id. Beam search layer's output ids, however, 
+    are generated directly during the tree search, and they are stacked by each 
+    level of the search tree. Thus we need to reorganize them into sentences, 
+    based on the score of each id. This layer takes the output of beam search
+    layer as input and repack them into sentences.
 
     Args:
-        ids (Variable): ${ids_comment}
-        scores (Variable): ${scores_comment}
+        ids (Variable): The selected ids, output of beam search layer. 
+        scores (Variable): The associated scores of the ids, out put of beam
+            search layer.
         name (str): The name of this layer. It is optional.
-    
+
     Returns:
-        tuple: a tuple of two output variable: sentence_ids, sentence_scores
+        tuple(Variable): a tuple of two output tensors: sentence_ids, sentence_scores.
+        sentence_ids is a tensor with shape [size, length], where size is the
+        beam size of beam search, and length is the length of each sentence. 
+        Note that the length of sentences may vary.
+        sentence_scores is a tensor with the same shape as sentence_ids.
+
+    Examples:
+        .. code-block:: python
+
+            ids, scores = fluid.layers.beam_search(
+                pre_ids, ids, scores, beam_size, end_id)
+            sentence_ids, sentence_scores = fluid.layers.beam_search_decode(
+                ids, scores)
     """
     helper = LayerHelper('beam_search_decode', **locals())
     sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
@@ -1814,32 +2293,36 @@ def conv2d_transpose(input,
     represent height and width, respectively. The details of convolution transpose
     layer, please refer to the following explanation and references
     `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
 
     For each input :math:`X`, the equation is:
 
     .. math::
 
-        Out = W \\ast X
+        Out = \sigma (W \\ast X + b)
 
-    In the above equation:
+    Where:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast` : Convolution transpose operation.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
+          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
 
         - Output:
 
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
@@ -1893,10 +2376,8 @@ def conv2d_transpose(input,
     Examples:
        .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d_transpose = fluid.layers.conv2d_transpose(
-              input=data, num_filters=2, filter_size=3)
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
     """
     helper = LayerHelper("conv2d_transpose", **locals())
     if not isinstance(input, Variable):
@@ -1952,6 +2433,175 @@ def conv2d_transpose(input,
     return out
 
 
+def conv3d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=0,
+                     stride=1,
+                     dilation=1,
+                     groups=None,
+                     param_attr=None,
+                     bias_attr=None,
+                     use_cudnn=True,
+                     act=None,
+                     name=None):
+    """
+    **Convlution3D transpose layer**
+
+    The convolution3D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCDHW format. Where N is batch size, C is the number of channels,
+    D is the depth of the feature, H is the height of the feature, and W
+    is the width of the feature. Parameters(dilations, strides, paddings) are
+    two elements. These two elements represent height and width, respectively.
+    The details of convolution transpose layer, please refer to the following
+    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
+           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
+           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
+
+    Args:
+        input(Variable): The input image with [N, C, D, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain three integers, (image_D, image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv3d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer.
+            Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act(str): Activation type. Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+       .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
+    """
+    l_type = "conv3d_transpose"
+    helper = LayerHelper(l_type, **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv3d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    padding = utils.convert_to_list(padding, 3, 'padding')
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        d_in = input.shape[2]
+        h_in = input.shape[3]
+        w_in = input.shape[4]
+
+        filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) / dilation[0] + 1
+        filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) / dilation[1] + 1
+        filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 *
+                         padding[2] - 1) / dilation[2] + 1
+        filter_size = [filter_size_d, filter_size_h, filter_size_w]
+    else:
+        filter_size = utils.convert_to_list(filter_size, 3,
+                                            'conv3d_transpose.filter_size')
+
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type=l_type,
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+    out = helper.append_activation(pre_act)
+    return out
+
+
 def sequence_expand(x, y, ref_level=-1, name=None):
     """Sequence Expand Layer. This layer will expand the input variable **x**
     according to specified level lod of **y**. Please note that lod level of
@@ -1963,18 +2613,18 @@ def sequence_expand(x, y, ref_level=-1, name=None):
 
         * Case 1
             x is a LoDTensor:
-                x.lod  = [[0,   2,        4]]
+                x.lod  = [[2,        2]]
                 x.data = [[a], [b], [c], [d]]
                 x.dims = [4, 1]
 
             y is a LoDTensor:
-                y.lod = [[0,    2,    4],
-                         [0, 3, 6, 7, 8]]
+                y.lod = [[2,    2],
+                         [3, 3, 1, 1]]
 
             ref_level: 0
 
             then output is a 1-level LoDTensor:
-                out.lod =  [[0,   2,        4,        6,        8]]
+                out.lod =  [[2,        2,        2,        2]]
                 out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
                 out.dims = [8, 1]
 
@@ -1984,7 +2634,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
                 x.dims = [3, 1]
 
             y is a LoDTensor:
-                y.lod = [[0, 2, 2, 5]]
+                y.lod = [[2, 0, 3]]
 
             ref_level: -1
 
@@ -2033,7 +2683,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
         beam_size (int): ${beam_size_comment}
         end_id (int): ${end_id_comment}
         level (int): ${level_comment}
-    
+
     Returns:
         tuple: a tuple of beam_search output variables: selected_ids, selected_scores
     '''
@@ -2243,23 +2893,24 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
 def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
-    Computes the mean of tensor elements over the given dimension.
+    Computes the mean of the input tensor's elements along the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (list|int|None): The dimensions along which the mean is computed. If
-            :attr:`None`, compute the mean over all elements of :attr:`input`
-            and return a Tensor variable with a single element, otherwise
+        dim (list|int|None): The dimension along which the mean is computed. If
+            `None`, compute the mean over all elements of :attr:`input`
+            and return a variable with a single element, otherwise it
             must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
+            :math:`dim[i] < 0`, the dimension to reduce is 
+            :math:`rank(input) + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set `None`, the layer
                        will be named automatically.
 
     Returns:
-        Variable: The reduced Tensor variable.
+        Variable: The reduced mean Variable.
 
     Examples:
         .. code-block:: python
@@ -2481,7 +3132,7 @@ def split(input, num_or_sections, dim=-1, name=None):
                        will be named automatically.
 
     Returns:
-        List: The list of segmented tensor variables.
+        list(Variable): The list of segmented tensor variables.
 
     Examples:
         .. code-block:: python
@@ -2532,32 +3183,33 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
 
     .. math::
-    y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
+
+        y = \\frac{x}{ \sqrt{\sum {x^2} + epsion }}
 
     For `x` with more dimensions, this layer independently normalizes each 1-D
     slice along dimension `axis`.
 
     Args:
         x(Variable|list): The input tensor to l2_normalize layer.
-        axis(int): The axis on which to apply normalization. If `axis < 0`,
+        axis(int): The axis on which to apply normalization. If `axis < 0`, \
             the dimension to normalization is rank(X) + axis. -1 is the
             last dimension.
-        epsilon(float): The epsilon value is used to avoid division by zero,
+        epsilon(float): The epsilon value is used to avoid division by zero, \
             the defalut value is 1e-10.
-        name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set None, the layer \
             will be named automatically.
 
-
     Returns:
-        Variable: The output tensor variable.
+        Variable: The output tensor variable is the same shape with `x`.
 
     Examples:
+
         .. code-block:: python
 
-          data = fluid.layers.data(name="data",
-                                   shape=(3, 17, 13),
-                                   dtype="float32")
-          normed = fluid.layers.l2_normalize(x=data, axis=1)
+            data = fluid.layers.data(name="data",
+                                     shape=(3, 17, 13),
+                                     dtype="float32")
+            normed = fluid.layers.l2_normalize(x=data, axis=1)
     """
 
     if len(x.shape) == 1:
@@ -2689,25 +3341,51 @@ def topk(input, k, name=None):
     This operator is used to find values and indices of the k largest entries
     for the last dimension.
 
-    If the input is a vector (rank=1), finds the k largest entries in the vector
+    If the input is a vector (1-D Tensor), finds the k largest entries in the vector
     and outputs their values and indices as vectors. Thus values[j] is the j-th
     largest entry in input, and its index is indices[j].
 
     If the input is a Tensor with higher rank, this operator computes the top k
     entries along the last dimension.
 
+    For example:
+
+    .. code-block:: text
+
+        If:
+            input = [[5, 4, 2, 3],
+                     [9, 7, 10, 25],
+                     [6, 2, 10, 1]]
+            k = 2
+
+        Then:
+            The first output:
+            values = [[5, 4],
+                      [10, 25],
+                      [6, 10]]
+
+            The second output:
+            indices = [[0, 1],
+                       [2, 3],
+                       [0, 2]]
+
     Args:
         input(Variable): The input variable which can be a vector or Tensor with
             higher rank.
-        k(int): An integer value to specify the top k largest elements.
+        k(int):  The number of top elements to look for along the last dimension 
+                 of input.
         name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+                       will be named automatically. 
+                       Default: None
 
     Returns:
-        values(Variable): The k largest elements along each last dimensional
-            slice.
-        indices(Variable): The indices of values within the last dimension of
-            input.
+        Tuple[Variable]: A tuple with two elements. Each element is a Variable. 
+        The first one is k largest elements along each last 
+        dimensional slice. The second one is indices of values 
+        within the last dimension of input.
+
+    Raises:
+        ValueError: If k < 1 or k is not less than the last dimension of input
 
     Examples:
         .. code-block:: python
@@ -2715,7 +3393,7 @@ def topk(input, k, name=None):
             top5_values, top5_indices = layers.topk(input, k=5)
     """
     shape = input.shape
-    if k < 1 and k >= shape[-1]:
+    if k < 1 or k >= shape[-1]:
         raise ValueError("k must be greater than 0 and less than %d." %
                          (shape[-1]))
 
@@ -2733,8 +3411,7 @@ def topk(input, k, name=None):
     return values, indices
 
 
-def edit_distance(input, label, normalized=True, ignored_tokens=None,
-                  name=None):
+def edit_distance(input, label, normalized=True, ignored_tokens=None):
     """
     EditDistance operator computes the edit distances between a batch of
     hypothesis strings and their references. Edit distance, also called
@@ -2748,21 +3425,21 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
 
     "kitten" -> "sitten" -> "sittin" -> "sitting"
 
-    Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with
+    The input is a LoDTensor consisting of all the hypothesis strings with
     the total number denoted by `batch_size`, and the separation is specified
     by the LoD information. And the `batch_size` reference strings are arranged
-    in order in the same way in the LoDTensor Input(Refs).
+    in order in the same way in the input LoDTensor.
 
-    Output(Out) contains the `batch_size` results and each stands for the edit
+    The output contains the `batch_size` results and each stands for the edit
     distance for a pair of strings respectively. If Attr(normalized) is true,
     the edit distance will be divided by the length of reference string.
 
     Args:
         input(Variable): The indices for hypothesis strings.
         label(Variable): The indices for reference strings.
-        normalized(bool): Indicated whether to normalize the edit distance by
+        normalized(bool, default True): Indicated whether to normalize the edit distance by
                           the length of reference string.
-        ignored_tokens(list of int): Tokens that should be removed before
+        ignored_tokens(list<int>, default None): Tokens that should be removed before
                                      calculating edit distance.
         name (str): The name of this layer. It is optional.
 
@@ -2774,7 +3451,6 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
 
             x = fluid.layers.data(name='x', shape=[8], dtype='float32')
             y = fluid.layers.data(name='y', shape=[7], dtype='float32')
-
             cost = fluid.layers.edit_distance(input=x,label=y)
     """
     helper = LayerHelper("edit_distance", **locals())
@@ -2815,6 +3491,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
 def ctc_greedy_decoder(input, blank, name=None):
     """
     This op is used to decode sequences by greedy policy by below steps:
+
     1. Get the indexes of max value for each row in input. a.k.a.
        numpy.argmax(input, axis=0).
     2. For each sequence in result of step1, merge repeated tokens between two
@@ -2836,7 +3513,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.2, 0.2, 0.1, 0.5],
                       [0.5, 0.1, 0.3, 0.1]]
 
-        input.lod = [[0, 4, 8]]
+        input.lod = [[4, 4]]
 
         Then:
 
@@ -2844,7 +3521,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                        [1],
                        [3]]
 
-        output.lod = [[0, 2, 3]]
+        output.lod = [[2, 1]]
 
     Args:
 
@@ -2861,7 +3538,7 @@ def ctc_greedy_decoder(input, blank, name=None):
 
     Returns:
         Variable: CTC greedy decode result. If all the sequences in result were
-        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
+        empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -2894,35 +3571,33 @@ def warpctc(input, label, blank=0, norm_by_times=False):
     input tensor.
 
     Args:
-        input(Variable): (LodTensor, default: LoDTensor<float>),
-            the unscaled probabilities of variable-length sequences,
-            which is a 2-D Tensor with LoD information.
-            It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
-            sequences' length and num_classes is the true number of classes.
-            (not including the blank label).
-        label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
-            of variable-length sequence, which is a 2-D Tensor with LoD
-            information. It is of the shape [Lg, 1], where Lg is th sum of
-            all labels' length.
-        blank (int): default 0, the blank label index of Connectionist
-            Temporal Classification (CTC) loss, which is in the
-            half-opened interval [0, num_classes + 1).
-        norm_by_times (bool): default false, whether to normalize
-            the gradients by the number of time-step, which is also the
-            sequence's length. There is no need to normalize the gradients
-            if warpctc layer was follewed by a mean_op.
+       input (Variable): The unscaled probabilities of variable-length sequences,
+         which is a 2-D Tensor with LoD information.
+         It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
+         sequences' length and num_classes is the true number of classes.
+         (not including the blank label).
+       label (Variable): The ground truth of variable-length sequence, 
+         which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
+         where Lg is th sum of all labels' length.
+       blank (int, default 0): The blank label index of Connectionist
+         Temporal Classification (CTC) loss, which is in the
+         half-opened interval [0, num_classes + 1).
+       norm_by_times(bool, default false): Whether to normalize the gradients 
+         by the number of time-step, which is also the sequence's length. 
+         There is no need to normalize the gradients if warpctc layer was 
+         follewed by a mean_op.
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
         which is a 2-D Tensor of the shape [batch_size, 1].
 
     Examples:
+
         .. code-block:: python
-            y = layers.data(
-                name='y', shape=[11, 8], dtype='float32', lod_level=1)
-            y_predict = layers.data(
-                name='y_predict', shape=[11, 1], dtype='float32')
-            cost = layers.warpctc(input=y_predict, label=y)
+
+            label = fluid.layers.data(shape=[11, 8], dtype='float32', lod_level=1)
+            predict = fluid.layers.data(shape=[11, 1], dtype='float32')
+            cost = fluid.layers.warpctc(input=predict, label=label)
 
     """
     helper = LayerHelper('warpctc', **locals())
@@ -2952,16 +3627,20 @@ def sequence_reshape(input, new_dim):
 
         x is a LoDTensor:
             x.lod  = [[0, 2, 6]]
-            x.data = [[1, 2], [3, 4],
-                      [5, 6], [7, 8], [9, 10], [11, 12]]
+            x.data = [[1,  2], [3,  4],
+                      [5,  6], [7,  8],
+                      [9, 10], [11, 12]]
             x.dims = [6, 2]
 
         set new_dim = 4
 
         then out is a LoDTensor:
+
             out.lod  = [[0, 1, 3]]
-            out.data = [[1, 2, 3, 4],
-                        [5, 6, 7, 8], [9, 10, 11, 12]]
+
+            out.data = [[1,  2,  3,  4],
+                        [5,  6,  7,  8],
+                        [9, 10, 11, 12]]
             out.dims = [3, 4]
 
     Currently, only 1-level LoDTensor is supported and please make sure
@@ -2969,19 +3648,19 @@ def sequence_reshape(input, new_dim):
     no remainder for each sequence.
 
     Args:
-        input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-            with shape being [N, M] where M for dimension.
-        new_dim (int): New dimension which the input LoDTensor is reshaped to.
+
+       input (Variable): A 2-D LoDTensor with shape being [N, M] where M for dimension.
+       new_dim (int): New dimension that the input LoDTensor is reshaped to.
 
     Returns:
+
         Variable: Reshaped LoDTensor according to new dimension.
 
     Examples:
         .. code-block:: python
 
-            x = fluid.layers.data(name='x', shape=[5, 20],
-                              dtype='float32', lod_level=1)
-            x_reshaped = layers.sequence_reshape(input=x, new_dim=10)
+            x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
+            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
     """
     helper = LayerHelper('sequence_reshape', **locals())
     out = helper.create_tmp_variable(helper.input_dtype())
@@ -3011,13 +3690,41 @@ def nce(input,
         input (Variable): input variable.
         label (Variable): label.
         num_total_classes (int):${num_total_classes_comment}
-        sample_weight (int): ${sample_weight_comment}
+        sample_weight (Variable|None): A Variable of shape [batch_size, 1] 
+            storing a weight for each sample. The default weight for each 
+            sample is 1.0.
         param_attr (ParamAttr|None): attributes for parameter
         bias_attr (ParamAttr|None): attributes for bias
         num_neg_samples (int): ${num_neg_samples_comment}
-    
+
     Returns:
-        Variable: output of nce layer.
+        Variable: The output nce loss.
+
+    Examples:
+        .. code-block:: python
+
+            window_size = 5
+            words = []
+            for i in xrange(window_size):
+                words.append(layers.data(
+                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+
+            dict_size = 10000
+            label_word = int(window_size / 2) + 1
+
+            embs = []
+            for i in xrange(window_size):
+                if i == label_word:
+                    continue
+
+                emb = layers.embedding(input=words[i], size=[dict_size, 32],
+                                       param_attr='emb.w', is_sparse=True)
+                embs.append(emb)
+
+            embs = layers.concat(input=embs, axis=1)
+            loss = layers.nce(input=embs, label=words[label_word],
+                          num_total_classes=dict_size, param_attr='nce.w',
+                          bias_attr='nce.b')
     """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
@@ -3068,8 +3775,6 @@ def nce(input,
 
 def transpose(x, perm, name=None):
     """
-    **transpose Layer**
-
     Permute the dimensions of `input` according to `perm`.
 
     The `i`-th dimension  of the returned tensor will correspond to the
@@ -3159,8 +3864,6 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
 
     Examples:
 
-    As an example:
-
         .. code-block:: text
 
             Given:
@@ -3202,9 +3905,9 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
 
             output.dims = {8, 9}
 
-            output.lod = [[0, 4, 8]]
+            output.lod = [[4, 4]]
 
-        The simple usage is:
+     Examples:
 
         .. code-block:: python
 
@@ -3237,29 +3940,13 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
     return out
 
 
+@templatedoc()
 def row_conv(input, future_context_size, param_attr=None, act=None):
-    """Row Conv Operator. This layer will apply lookahead convolution to
-    **input**. The input variable should be a 2D LoDTensor with shape [T, D].
-    Parameters with shape [future_context_size + 1, D] will be created. The math
-    equation of row convolution is as follows:
-
-    .. math::
-        Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j}
-
-    In the above equation:
-
-    * :math:`Out_{i}`: The i-th row of output variable with shape [1, D].
-    * :math:`\\tau`: Future context size.
-    * :math:`X_{j}`: The j-th row of input variable with shape [1, D].
-    * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D].
-
-    More details about row_conv please refer to the paper \
-    (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and
-    the design document \
-    (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645).
+    """
+    ${comment}
 
     Args:
-        input (Variable): Input variable, a 2D LoDTensor with shape [T, D].
+        input (${x_type}): ${x_comment}.
         future_context_size (int): Future context size. Please note, the shape
             of convolution kernel is [future_context_size + 1, D].
         param_attr (ParamAttr): Attributes of parameters, including
@@ -3267,14 +3954,13 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
         act (str): Non-linear activation to be applied to output variable.
 
     Returns:
-        Variable: The output tensor with same shape as input tensor.
+        ${out_comment}.
 
     Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[16],
-                            dtype='float32', lod_level=1)
-            out = fluid.layers.row_conv(input=x, future_context_size=2)
+        >>> import paddle.fluid as fluid
+        >>> x = fluid.layers.data(name='x', shape=[16],
+        >>>                        dtype='float32', lod_level=1)
+        >>> out = fluid.layers.row_conv(input=x, future_context_size=2)
     """
     helper = LayerHelper('row_conv', **locals())
     dtype = helper.input_dtype()
@@ -3290,42 +3976,23 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     return helper.append_activation(out)
 
 
+@templatedoc()
 def multiplex(inputs, index):
     """
-    **Multiplex Layer**
-
-    Referring to the given index variable, this layer selects rows from the
-    input variables to construct a multiplex variable. Assuming that there are
-    :math:`m` input variables and :math:`I_i` represents the i-th input
-    variable and :math:`i` is in [0, :math:`m`). All input variables are
-    tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
-    Please note that rank of the input tensor should be at least 2. Each input
-    variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
-    where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
-    * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
-    variable. The given index variable should be a 2-D tensor with shape
-    [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
-    Then the output variable will be a tensor with shape [:math:`d_0`,
-    :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
-    matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
-    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
+    >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
+    >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32')
+    >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
 
     Args:
-        inputs (list): A list of variables to gather from. All variables have the
-                same shape and the rank is at least 2.
-        index (Variable): Tensor<int32>, index variable which is a 2-D tensor
-                with shape [M, 1] where M is the batch size.
+       inputs (list): ${x_comment}.
+       index (${ids_type}): ${ids_comment}.
 
     Returns:
-        Variable: Multiplex variable gathered from input variables.
-
-    Examples:
-        .. code-block:: python
-
-            x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[1], dtype='int32')
-            out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
+        ${out_comment}.
     """
     helper = LayerHelper('multiplex', **locals())
 
@@ -3411,31 +4078,30 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
 
 def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
-    **Smooth L1 Loss Operator. **
-
-    This operator computes the smooth L1 loss for X and Y.
-    The operator takes the first dimension of X and Y as batch size.
+    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
+    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
     For each instance, it computes the smooth L1 loss element by element first
-    and then sums all the losses. So the shape of Out is [batch_size, 1].
+    and then sums all the losses. So the shape of ouput Variable is 
+    [batch_size, 1].
 
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
             L1 loss op with shape [batch_size, dim1, ..., dimN].
         y (Variable): A tensor with rank at least 2. The target value of smooth
-            L1 loss op with same shape as x.
+            L1 loss op with same shape as :attr:`x`.
         inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with x. If provided,
-            the result of (x - y) will be multiplied by this tensor element by
-            element.
+            input is optional and should have same shape with :attr:`x`. If 
+            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied 
+            by this tensor element by element.
         outside_weight (Variable|None): A tensor with rank at least 2. This
-            input is optional and should have same shape with x. If provided,
-            the out smooth L1 loss will be multiplied by this tensor element
-            by element.
-        sigma (float|None): Hyper parameter of smooth L1 loss op. A float scalar
-            with default value 1.0.
+            input is optional and should have same shape with :attr:`x`. If 
+            provided, the out smooth L1 loss will be multiplied by this tensor 
+            element by element.
+        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float 
+           scalar with default value 1.0.
+
     Returns:
-        Variable: A tensor with rank be 2. The output smooth L1 loss with
-            shape [batch_size, 1].
+        Variable: The output smooth L1 loss with shape [batch_size, 1].
 
     Examples:
         .. code-block:: python
@@ -3446,6 +4112,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
             fc = fluid.layers.fc(input=data, size=100)
             out = fluid.layers.smooth_l1(x=fc, y=label)
     """
+
     helper = LayerHelper('smooth_l1_loss', **locals())
     diff = helper.create_tmp_variable(dtype=x.dtype)
     loss = helper.create_tmp_variable(dtype=x.dtype)
@@ -3465,32 +4132,20 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
 
 def one_hot(input, depth):
     """
-    One Hot Operator. This operator creates the one-hot representations for input
-    index values. The following example will help to explain the function of this
-    operator.
+    This layer creates the one-hot representations for input indices.
 
     Args:
-        input(variable):  A Tensor/LodTensor of indices, last dimension must be 1.
-        depth(scalar): an interger defining the depth of the one hot dimension.
+        input(Variable): Input indices, last dimension must be 1.
+        depth(scalar): An interger defining the depth of the one-hot dimension.
 
     Returns:
-         The one-hot tensor or LodTensor, same as input.
+        Variable: The one-hot representations of input.
 
     Examples:
         .. code-block:: python
-
-        X is a LoDTensor:
-          X.lod = [[0, 1, 4]]
-          X.shape = [4, 1]
-          X.data = [[1], [1], [3], [0]]
-        set depth = 4
-        Out is a LoDTensor:
-          Out.lod = [[0, 1, 4]]
-          Out.shape = [4, 4]
-          Out.data = [[0., 1., 0., 0.],
-                      [0., 1., 0., 0.],
-                      [0., 0., 0., 1.],
-                      [1., 0., 0., 0.]]
+        
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
     """
     helper = LayerHelper("one_hot", **locals())
     one_hot_out = helper.create_tmp_variable(dtype='float32')
@@ -3504,8 +4159,9 @@ def one_hot(input, depth):
 
 def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     """
-    NOTE: The counter will be automatically increased by 1 every mini-batch
-    Return the run counter of the main program, which is started with 1.
+    Create an auto-increase variable
+    which will be automatically increased by 1 every mini-batch
+    Return the run counter of the main program, default is started from 1.
 
     Args:
         counter_name(str): The counter name, default is '@STEP_COUNTER@'.
@@ -3514,6 +4170,12 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
 
     Returns:
         Variable: The global run counter.
+
+    Examples:
+        .. code-block:: python
+
+           global_step = fluid.layers.autoincreased_step_counter(
+               counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
     """
     helper = LayerHelper('global_step_counter')
     if counter_name is None:
@@ -3634,73 +4296,74 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
 
 def lod_reset(x, y=None, target_lod=None):
     """
-    LoD Reset Operator. Set LoD of **x** to a new one specified by **y** or
-    **target_lod**. When **y** provided, **y.lod** would be considered as target
-    LoD first, otherwise **y.data** would be considered as target LoD. If **y**
-    is not provided, target LoD should be specified by **target_lod**.
-    If target LoD is specified by **Y.data** or **target_lod**, only one level
-    LoD is supported.
+    Set LoD of :attr:`x` to a new one specified by :attr:`y` or
+    :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be 
+    considered as target LoD first, otherwise :attr:`y.data` would be 
+    considered as target LoD. If :attr:`y` is not provided, target LoD should 
+    be specified by :attr:`target_lod`. If target LoD is specified by 
+    :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported.
 
     .. code-block:: text
 
         * Example 1:
 
             Given a 1-level LoDTensor x:
-                x.lod =  [[ 0,     2,                   5      6 ]]
+                x.lod =  [[ 2,           3,                   1 ]]
                 x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 x.dims = [6, 1]
 
-            target_lod: [0, 4, 6]
+            target_lod: [4, 2]
 
             then we get a 1-level LoDTensor:
-                out.lod =  [[ 0,                   4,            6 ]]
+                out.lod =  [[4,                          2]]
                 out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 out.dims = [6, 1]
 
         * Example 2:
 
             Given a 1-level LoDTensor x:
-                x.lod =  [[ 0,     2,                   5      6 ]]
+                x.lod =  [[2,            3,                   1]]
                 x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 x.dims = [6, 1]
 
             y is a Tensor:
-                y.data = [[0, 2, 6]]
+                y.data = [[2, 4]]
                 y.dims = [1, 3]
 
             then we get a 1-level LoDTensor:
-                out.lod =  [[ 0,     2,                          6 ]]
+                out.lod =  [[2,            4]]
                 out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 out.dims = [6, 1]
 
         * Example 3:
 
             Given a 1-level LoDTensor x:
-                x.lod =  [[ 0,      2,                   5     6 ]]
+                x.lod =  [[2,            3,                   1]]
                 x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 x.dims = [6, 1]
 
             y is a 2-level LoDTensor:
-                y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+                y.lod =  [[2, 2], [2, 2, 1, 1]]
                 y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
                 y.dims = [6, 1]
 
             then we get a 2-level LoDTensor:
-                out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+                out.lod =  [[2, 2], [2, 2, 1, 1]]
                 out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
                 out.dims = [6, 1]
 
     Args:
         x (Variable): Input variable which could be a Tensor or LodTensor.
-        y (Variable|None): If provided, output's LoD would be derived from y.
+        y (Variable|None): If provided, output's LoD would be derived 
+                           from :attr:`y`.
         target_lod (list|tuple|None): One level LoD which should be considered
-                                      as target LoD when y not provided.
+                                      as target LoD when :attr:`y` not provided.
 
     Returns:
-        Variable: Output variable with LoD specified by this operator.
+        Variable: Output variable with LoD specified by this layer.
 
     Raises:
-        ValueError: If y and target_lod are both None.
+        ValueError: If :attr:`y` and :attr:`target_lod` are both None.
 
     Examples:
         .. code-block:: python
@@ -3736,9 +4399,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 
     .. math::
 
-        Output(i, x, y) = Input(i, x, y) / \left(
-        k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)}
-        (Input(j, x, y))^2 \right)^{\beta}
+      Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C, c + n/2)}_{j = \\max(0, c - n/2)}(Input(j, x, y))^2\\right)^{\\beta}
 
     In the above equation:
 
@@ -3922,34 +4583,20 @@ def label_smooth(label,
     return smooth_label
 
 
+@templatedoc()
 def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
     """
-    Region of interest pooling (also known as RoI pooling) is to perform
-        is to perform max pooling on inputs of nonuniform sizes to obtain
-        fixed-size feature maps (e.g. 7*7).
-    The operator has three steps:
-        1. Dividing each region proposal into equal-sized sections with
-           the pooled_width and pooled_height
-        2. Finding the largest value in each section
-        3. Copying these max values to the output buffer
+    ${comment}
 
     Args:
-        input (Variable): The input for ROI pooling.
-        rois (Variable): ROIs (Regions of Interest) to pool over. It should
-                         be a 2-D one level LoTensor of shape [num_rois, 4].
-                         The layout is [x1, y1, x2, y2], where (x1, y1)
-                         is the top left coordinates, and (x2, y2) is the
-                         bottom right coordinates. The num_rois is the
-                         total number of ROIs in this batch data.
-        pooled_height (integer): The pooled output height. Default: 1
-        pooled_width (integer): The pooled output width. Default: 1
-        spatial_scale (float): Multiplicative spatial scale factor. To
-                               translate ROI coords from their input scale
-                               to the scale used when pooling. Default: 1.0
+        input (Variable): ${x_comment}
+        rois (Variable): ROIs (Regions of Interest) to pool over.
+        pooled_height (integer): ${pooled_height_comment} Default: 1
+        pooled_width (integer): ${pooled_width_comment} Default: 1
+        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
 
     Returns:
-        pool_out (Variable): The output is a 4-D tensor of the shape
-                             (num_rois, channels, pooled_h, pooled_w).
+        Variable: ${out_comment}.
 
     Examples:
         .. code-block:: python
@@ -4021,12 +4668,13 @@ def image_resize(input,
                  name=None,
                  resample='BILINEAR'):
     """
-    Resize a batch of images.
+    **Resize a Batch of Images**
 
     The input must be a tensor of the shape (num_batches, channels, in_h, in_w), 
     and the resizing only applies on the last two dimensions(hight and width).
 
     Supporting resample methods:
+
         'BILINEAR' : Bilinear interpolation
 
     Args:
@@ -4046,8 +4694,8 @@ def image_resize(input,
                        Default: 'BILINEAR'
 
     Returns:
-        out (Variable): The output is a 4-D tensor of the shape
-                        (num_batches, channls, out_h, out_w).
+        Variable: The output is a 4-D tensor of the shape
+        (num_batches, channls, out_h, out_w).
 
     Examples:
         .. code-block:: python
@@ -4131,8 +4779,8 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
         resample (str): resample method, default: BILINEAR.
 
     Returns:
-        out (Variable): The output is a 4-D tensor of the shape
-                        (num_batches, channls, out_h, out_w).
+        Variable: The output is a 4-D tensor of the shape
+        (num_batches, channls, out_h, out_w).
     """
     in_shape = input.shape
     if len(in_shape) != 4:
@@ -4151,6 +4799,8 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'):
 
 def gather(input, index):
     """
+    **Gather Layer**
+
     Output is obtained by gathering entries of the outer-most dimension 
     of X indexed by `index` and concatenate them together.
 
@@ -4204,10 +4854,6 @@ def random_crop(x, shape, seed=None):
     """
     ${comment}
 
-    Examples:
-        >>> img = fluid.layers.data("img", [3, 256, 256])
-        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
-
     Args:
         x(${x_type}): ${x_comment}
         shape(${shape_type}): ${shape_comment}
@@ -4216,7 +4862,10 @@ def random_crop(x, shape, seed=None):
 
     Returns:
         ${out_comment}
-
+    
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
     """
     helper = LayerHelper("random_crop", **locals())
     dtype = helper.input_dtype()
@@ -4250,6 +4899,62 @@ def random_crop(x, shape, seed=None):
     return out
 
 
+def log(x):
+    """
+    Calculates the natural log of the given input tensor, element-wise.
+
+    .. math::
+
+        Out = \\ln(x)
+
+    Args:
+        x (Variable): Input tensor. 
+
+    Returns:
+        Variable: The natural log of the input tensor computed element-wise.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.log(x)
+    """
+    helper = LayerHelper('log', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(type="log", inputs={"X": input}, outputs={"Out": out})
+    return out
+
+
+def relu(x):
+    """
+    Relu takes one input data (Tensor) and produces one output data (Tensor)
+    where the rectified linear function, y = max(0, x), is applied to
+    the tensor elementwise.
+
+    .. math::
+
+        Out = \\max(0, x)
+
+    Args:
+        x (Variable): The input tensor. 
+
+    Returns:
+        Variable: The output tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.relu(x)
+    """
+    helper = LayerHelper('relu', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(type="relu", inputs={"X": input}, outputs={"Out": out})
+    return out
+
+
 def mean_iou(input, label, num_classes):
     """
     Mean Intersection-Over-Union is a common evaluation metric for
@@ -4258,8 +4963,8 @@ def mean_iou(input, label, num_classes):
     IOU is defined as follows: 
     
     .. math::
-        
-        IOU = true_positive / (true_positive + false_positive + false_negative). 
+
+        IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
 
     The predictions are accumulated in a confusion matrix and mean-IOU 
     is then calculated from it.
@@ -4267,19 +4972,19 @@ def mean_iou(input, label, num_classes):
 
     Args:
         input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
-        label (Variable):  A Tensor of ground truth labels with type int32 or int64. 
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
                            Its shape should be the same as input.
+        num_classes (int): The possible number of labels.
 
     Returns:
         mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
         out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
         out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. 
 
-
     Examples:
 
         .. code-block:: python
-
+            
             iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
     """
     helper = LayerHelper('mean_iou', **locals())
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 98f169e8f0881fbba6aecb45b43a52c8fd51132d..9e97ec9a6f55680a2eb44ad712ac002df4fecda5 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -17,7 +17,6 @@ __activations__ = [
     'sigmoid',
     'logsigmoid',
     'exp',
-    'relu',
     'tanh',
     'tanh_shrink',
     'softshrink',
@@ -29,7 +28,6 @@ __activations__ = [
     'sin',
     'round',
     'reciprocal',
-    'log',
     'square',
     'softplus',
     'softsign',
@@ -40,8 +38,6 @@ __activations__ = [
     'relu6',
     'pow',
     'stanh',
-    'hard_shrink',
-    'thresholded_relu',
     'hard_sigmoid',
     'swish',
 ]
@@ -64,18 +60,102 @@ __all__ = [
     'logical_or',
     'logical_xor',
     'logical_not',
-    'uniform_random',
     'uniform_random_batch_size_like',
     'gaussian_random',
     'gaussian_random_batch_size_like',
-    'cumsum',
     'scatter',
     'sum',
     'slice',
     'polygon_box_transform',
     'shape',
+    'iou_similarity',
     'maxout',
 ] + __activations__
 
 for _OP in set(__all__):
     globals()[_OP] = generate_layer_fn(_OP)
+
+__all__ += ["uniform_random"]
+
+_uniform_random_ = generate_layer_fn('uniform_random')
+
+
+def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _uniform_random_(**kwargs)
+
+
+uniform_random.__doc__ = _uniform_random_.__doc__ + """
+Examples:
+
+    >>> result = fluid.layers.uniform_random(shape=[32, 784])
+"""
+
+__all__ += ['hard_shrink']
+
+_hard_shrink_ = generate_layer_fn('hard_shrink')
+
+
+def hard_shrink(x, threshold=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _hard_shrink_(**kwargs)
+
+
+hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[784])
+    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
+"""
+
+__all__ += ['cumsum']
+
+_cum_sum_ = generate_layer_fn('cumsum')
+
+
+def cumsum(x, axis=None, exclusive=None, reverse=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+
+    return _cum_sum_(**kwargs)
+
+
+cumsum.__doc__ = _cum_sum_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[32, 784])
+    >>> result = fluid.layers.cumsum(data, axis=0)
+"""
+
+__all__ += ['thresholded_relu']
+
+_thresholded_relu_ = generate_layer_fn('thresholded_relu')
+
+
+def thresholded_relu(x, threshold=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+
+    _thresholded_relu_(**kwargs)
+
+
+thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[1])
+    >>> result = fluid.layers.thresholded_relu(data, threshold=0.4)
+"""
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 62b01d595a812ee8fc094e40b6dfb5c3f56cd012..149e77b52415025e78fbf4ee641151880b415fb0 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -6,7 +6,7 @@
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
+# Unlessf required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
@@ -35,10 +35,29 @@ __all__ = [
     'argmax',
     'ones',
     'zeros',
+    'reverse',
 ]
 
 
 def create_tensor(dtype, name=None, persistable=False):
+    """
+    Create an variable, which will hold a LoDTensor with data type dtype.
+
+    Args:
+        dtype(string): 'float32'|'int32'|..., the data type of the
+            created tensor.
+        name(string): The name of the created tensor, if not set,
+            the name will be a random unique one.
+        persistable(bool): Set the persistable flag of the create tensor.
+
+    Returns:
+        Variable: The tensor variable storing the created tensor.
+
+    Examples:
+        .. code-block:: python
+
+          tensor = fluid.layers.create_tensor(dtype='float32')
+    """
     helper = LayerHelper("create_tensor", **locals())
     return helper.create_variable(
         name=helper.name, dtype=dtype, persistable=persistable)
@@ -51,7 +70,12 @@ def create_parameter(shape,
                      is_bias=False,
                      default_initializer=None):
     """
-    Create a parameter
+    Create a parameter. The parameter is a learnable variable, which can have
+    gradient, and can be optimized.
+
+    NOTE: this is a very low-level API. This API is useful when you create
+    operator by your self. instead of using layers.
+
     Args:
         shape(list[int]): shape of the parameter
         dtype(string): element type of the parameter
@@ -63,7 +87,12 @@ def create_parameter(shape,
         default_initializer(Initializer): initializer for the parameter
 
     Returns:
-        Parameter: the created parameter
+        the created parameter.
+
+    Examples:
+        >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
+        >>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
+        >>> hidden = fluid.layers.matmul(x=data, y=W)
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:
@@ -79,16 +108,29 @@ def create_global_var(shape,
                       force_cpu=False,
                       name=None):
     """
-    Create a global variable. such as global_step
+    Create a new variable in the global block(block 0).
+
     Args:
         shape(list[int]): shape of the variable
-        value(float): the value of the variable
-        dtype(string): element type of the parameter
-        persistable(bool): if this variable is persistable
-        force_cpu(bool): force this variable to be on CPU
+        value(float): the value of the variable. The new created 
+                      variable will be filled with it.
+        dtype(string): data type of the variable
+        persistable(bool): if this variable is persistable. 
+                           Default: False
+        force_cpu(bool): force this variable to be on CPU. 
+                         Default: False
+        name(str|None): The name of the variable. If set to None the variable 
+                        name will be generated automatically. 
+                        Default: None
 
     Returns:
         Variable: the created Variable
+
+    Examples:
+        .. code-block:: python
+
+            var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', 
+                                 persistable=True, force_cpu=True, name='new_var')
     """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
@@ -101,8 +143,21 @@ def create_global_var(shape,
 
 def cast(x, dtype):
     """
-    This function takes in the input with input_dtype
-    and casts it to the output_dtype as the output.
+    This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts 
+    it to the output with :attr:`dtype`.
+
+    Args:
+        x (Variable): The input Variable for casting.
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Variable.
+
+    Returns:
+        Variable: The output Variable after casting.
+
+    Examples:
+        .. code-block:: python
+             
+            data = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            result = fluid.layers.cast(x=data, dtype='float64')
     """
     helper = LayerHelper('cast', **locals())
     out = helper.create_tmp_variable(dtype=dtype)
@@ -133,7 +188,8 @@ def concat(input, axis=0, name=None):
 
     Examples:
         .. code-block:: python
-          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
+        
+           out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
     """
     helper = LayerHelper('concat', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -146,19 +202,21 @@ def concat(input, axis=0, name=None):
 
 
 def sums(input, out=None):
-    """This function performs the sum operation on the input and returns the
+    """
+    This function performs the sum operation on the input and returns the
     result as the output.
 
     Args:
         input (Variable|list): The input tensor that has the elements
                                that need to be summed up.
+        out (Variable|None): Output parameter. The sum result.
+                             Default: None
 
     Returns:
-        Variable: The tensor type variable that has the sum of input
-                  written to it.
+        Variable: the sum of input. The same as the argument 'out'
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -191,6 +249,7 @@ def assign(input, output):
 
     Examples:
         .. code-block:: python
+
           out = fluid.layers.create_tensor(dtype='float32')
           hidden = fluid.layers.fc(input=data, size=10)
           fluid.layers.assign(hidden, out)
@@ -328,13 +387,13 @@ def argmin(x, axis=0):
         x(Variable): The input to compute the indices of
                      the min elements.
         axis(int): Axis to compute indices along.
-    
+
     Returns:
         Variable: The tensor variable storing the output
-    
+
     Examples:
         .. code-block:: python
-          
+
           out = fluid.layers.argmin(x=in, axis=0)
           out = fluid.layers.argmin(x=in, axis=-1)  
     """
@@ -359,13 +418,13 @@ def argmax(x, axis=0):
         x(Variable): The input to compute the indices of
                      the max elements.
         axis(int): Axis to compute indices along.
-    
+
     Returns:
         Variable: The tensor variable storing the output
-    
+
     Examples:
         .. code-block:: python
-          
+
           out = fluid.layers.argmax(x=in, axis=0)
           out = fluid.layers.argmax(x=in, axis=-1)  
     """
@@ -413,11 +472,12 @@ def zeros(shape, dtype, force_cpu=False):
     It also sets *stop_gradient* to True.
 
     Args:
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
+        shape(tuple|list|None): Shape of output tensor.
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor.
+        force_cpu(bool, default False): Whether to make output stay on CPU.
 
     Returns:
-        Variable: The tensor variable storing the output
+        Variable: The tensor variable storing the output.
 
     Examples:
         .. code-block:: python
@@ -486,11 +546,27 @@ def save_combine(x, file_path, overwrite=True):
     Saves a list of variables into a single file.
 
     Args:
-        x(list): A list of Tensor/LoDTensor to be saved together in a single file.
+        x(list): A list of Tensor/LoDTensor variables to be saved together in
+                 a single file.
         file_path(str): The file path where variables will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already 
+        overwrite(bool): Whether or not cover the given file when it has already
             existed. If it's set 'False' and the file is existed, a runtime 
             error will be thrown. 
+
+    Returns:
+        There is no return value.
+
+    Examples:
+
+        .. code-block:: python
+
+            v1 = fluid.layers.data(name="data",
+                                   shape=(4, 6),
+                                   dtype="float32")
+            v2 = fluid.layers.data(name="data",
+                                   shape=(6, 8, 4),
+                                   dtype="float32")
+            normed = fluid.layers.save_combine([v1, v2], file_path="output")
     """
     helper = LayerHelper("save_combine", **locals())
     helper.append_op(
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index 9946d0a4ff33b2f5040f6d2e31aa20fcf9c609a7..61be39c25912604f842ef8a9a6ec5f0d1cf70257 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -18,80 +18,6 @@ import numpy as np
 __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
 
 
-def _validate_lod(lod, tensor_height=-1):
-    """Check whether the input length-based lod info is valid.
-
-    There are several things to check:
-    1. lod should be a list of lists. Empty list is fine.
-    2. The length of each sublist (a lod level) should be at least one.
-    3. Each element in each lod level should be an integer greater than 0.
-    4. The sum of one lod level should be equal to the length of the next lod level.
-    5. The sum of the last lod level should be equal to the tensor height. 
-       Bypass this check if user does not provide tensor_height as input.
-
-    Args:
-        lod: the length-based lod info, e.g., [[2, 3], [2, 1, 2, 3, 4]].
-        tensor_height: the outermost dimension of the tensor with which the input 
-            lod is associated with. 
-
-    Returns:
-        A boolean indicating whether the input lod is valid or not.
-    """
-    assert isinstance(lod, list), "lod should be a list"
-    # Empty lod is fine
-    if len(lod) == 0:
-        return True
-
-    lod_sum = []
-    for level in lod:
-        assert isinstance(level, list), "each item in lod should be a list"
-        # Each level of lod should have at least one length info
-        if len(level) < 1:
-            return False
-        level_sum = 0
-        for lod_len in level:
-            # Each length in a level should be > 0
-            if lod_len <= 0:
-                return False
-            level_sum += lod_len
-        lod_sum.append(level_sum)
-
-    for idx, val in enumerate(lod_sum[:-1]):
-        # Each level's sum should be equal to 
-        # the number of items in the next level
-        if val != len(lod[idx + 1]):
-            return False
-
-    if tensor_height == -1:
-        return True
-    else:
-        # Last level's sum should be equal to the tensor height
-        return lod_sum[-1] == tensor_height
-
-
-def _convert_lod(lod):
-    """Convert a length-based lod to a offset-based lod.
-
-    If the length-based lod is [[2, 3], [2, 1, 2, 3, 4]],
-    then the offset-based lod is [[0, 2, 5], [0, 2, 3, 5, 8, 12]].
-
-    Args:
-        lod: a length-based lod info. 
-
-    Returns:
-        A list of lists as the offset-based lod converted to from the input lod.
-    """
-    new_lod = []
-    for level in lod:
-        cur_len = 0
-        new_level = [cur_len]
-        for lod_len in level:
-            cur_len += lod_len
-            new_level.append(cur_len)
-        new_lod.append(new_level)
-    return new_lod
-
-
 def create_lod_tensor(data, lod, place):
     """Create a lod tensor from a numpy array, a list, or an existing lod tensor.
 
@@ -139,11 +65,11 @@ def create_lod_tensor(data, lod, place):
         flattened_data = flattened_data.reshape([len(flattened_data), 1])
         return create_lod_tensor(flattened_data, lod, place)
     elif isinstance(data, np.ndarray):
-        assert _validate_lod(lod,
-                             data.shape[0]), "the provided lod info is invalid"
         tensor = core.LoDTensor()
         tensor.set(data, place)
-        tensor.set_lod(_convert_lod(lod))
+        tensor.set_recursive_sequence_lengths(lod)
+        assert tensor.has_valid_recursive_sequence_lengths(
+        ), "the provided lod info is invalid"
         return tensor
     else:
         raise TypeError(
@@ -181,9 +107,8 @@ def create_random_int_lodtensor(lod, base_shape, place, low, high):
         A fluid LoDTensor object with tensor data and lod info. 
     """
     assert isinstance(base_shape, list), "base_shape should be a list"
-    converted_lod = _convert_lod(lod)
     # append the total number of basic elements to the front of its shape
-    overall_shape = [converted_lod[-1][-1]] + base_shape
+    overall_shape = [sum(lod[-1])] + base_shape
     # the range of integer data elements is [low, high]    
     data = np.random.random_integers(low, high, overall_shape).astype("int64")
     return create_lod_tensor(data, lod, place)
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 115362c6bf33018342699a442c688e7356f3c206..54fe9356275c313cd18fbb12edc9d35f38bda772 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import re
 from collections import defaultdict
-from paddle.fluid.framework import Program
+from paddle.fluid.framework import Program, Variable
 import framework
 import layers
 from backward import append_backward
@@ -41,7 +41,10 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, learning_rate, regularization=None):
+    def __init__(self,
+                 learning_rate,
+                 regularization=None,
+                 LARS_weight_decay=0.0):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
@@ -61,6 +64,7 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._LARS_weight_decay = LARS_weight_decay
 
     def _create_global_learning_rate(self):
         lr = self.global_learning_rate()
@@ -100,10 +104,15 @@ class Optimizer(object):
         # create learning rate variable for every parameter
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
-        if param_lr == 1.0:
-            return self.global_learning_rate()
+        if type(param_lr) == Variable:
+            # param learning rate has been updated (LARS)
+            print("returns updated param lr ", param_lr)
+            return param_lr
         else:
-            return self.global_learning_rate() * param_lr
+            if param_lr == 1.0:
+                return self.global_learning_rate()
+            else:
+                return self.global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -210,6 +219,10 @@ class Optimizer(object):
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
             self._create_global_learning_rate()
+            if self._LARS_weight_decay > 0.0:
+                layers.append_LARS(parameters_and_grads,
+                                   self.global_learning_rate(),
+                                   self._LARS_weight_decay)
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index bc8a1aafc82d62501cecfa71be0cc3851c75eae2..99d51ae0076178aca50e36c2c187257a8ba1cbf2 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -76,8 +76,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     emb_layers.append(mark_embedding)
 
     hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
-        for emb in emb_layers
+        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
     ]
 
     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
@@ -94,8 +93,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
     for i in range(1, depth):
         mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
         ])
 
         lstm = fluid.layers.dynamic_lstm(
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 578b1162fbd7e3a1b1c0cc934406818f2e07e019..25bcb8a64103b845adbe2017120ce8d945faf6dd 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -94,7 +94,7 @@ def train(nn_type,
 
     test_program = fluid.default_main_program().clone(for_test=True)
 
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
     optimizer.minimize(avg_loss)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index ce3ba3ebc50d7b015f379b5e80b179463a7b231a..30b7a634a2b978df85d6432854ef12285460be44 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -22,12 +22,11 @@ class TestDataFeeder(unittest.TestCase):
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
         result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
-        print(result)
 
         self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
         self.assertEqual(result['label'].shape(), [2, 1])
-        self.assertEqual(result['image'].lod(), [])
-        self.assertEqual(result['label'].lod(), [])
+        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
     def test_lod_level_1_converter(self):
         # lod_level = 1
@@ -42,12 +41,12 @@ class TestDataFeeder(unittest.TestCase):
         # label = [1] * len(data)
         result = feeder.feed(
             [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
-        print(result)
 
         self.assertEqual(result['sentences'].shape(), [9, 1])
         self.assertEqual(result['label'].shape(), [3, 1])
-        self.assertEqual(result['sentences'].lod(), [[0, 3, 5, 9]])
-        self.assertEqual(result['label'].lod(), [])
+        self.assertEqual(result['sentences'].recursive_sequence_lengths(),
+                         [[3, 2, 4]])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
     def test_lod_level_2_converter(self):
         # lod_level = 2
@@ -62,12 +61,12 @@ class TestDataFeeder(unittest.TestCase):
         # label = [1] * len(data)
         result = feeder.feed(
             [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
-        print(result)
 
         self.assertEqual(result['paragraphs'].shape(), [9, 1])
         self.assertEqual(result['label'].shape(), [2, 1])
-        self.assertEqual(result['paragraphs'].lod(), [[0, 2, 3], [0, 3, 5, 9]])
-        self.assertEqual(result['label'].lod(), [])
+        self.assertEqual(result['paragraphs'].recursive_sequence_lengths(),
+                         [[2, 1], [3, 2, 4]])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
index 013d72f418cf7ac11eb31fd221052039e896e203..b7e7f5801fbbe58626eeec5fc77736d04bb3cefb 100644
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -13,44 +13,41 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor, _validate_lod, _convert_lod
-import numpy
+from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor
+import numpy as np
 import unittest
 
 
 class TestLoDTensor(unittest.TestCase):
-    def test_validate_lod(self):
-        lod = (1, 2, 1)
-        self.assertRaises(AssertionError, _validate_lod, lod, -1)
-        lod = [[1, 2], (2, 3)]
-        self.assertRaises(AssertionError, _validate_lod, lod, -1)
-        lod = [1, 2, 3]
-        self.assertRaises(AssertionError, _validate_lod, lod, -1)
-
+    def test_pybind_lod(self):
+        tensor = fluid.LoDTensor()
         lod = []
-        self.assertTrue(_validate_lod(lod, -1))
+        tensor.set_recursive_sequence_lengths(lod)
         lod = [[], [1], [3]]
-        self.assertFalse(_validate_lod(lod, -1))
-        lod = [[0], [-1], [3]]
-        self.assertFalse(_validate_lod(lod, -1))
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod)
+        lod = [[0], [2], [3]]
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths, lod)
 
-        # Each level's sum should be equal to the number of items in the next level
-        # Moreover, last level's sum should be equal to the tensor height
-        lod = [[2, 3], [1, 3, 1, 2, 1]]
-        self.assertTrue(_validate_lod(lod, tensor_height=8))
-        lod = [[1, 3], [2, 1, 3]]
-        self.assertFalse(_validate_lod(lod, tensor_height=6))
-        lod = [[1, 3], [2, 1, 3, 4]]
-        self.assertFalse(_validate_lod(lod, tensor_height=5))
-
-    def test_convert_lod(self):
         lod = [[1, 2, 3]]
-        converted_lod = [[0, 1, 3, 6]]
-        self.assertEqual(_convert_lod(lod), converted_lod)
+        tensor.set_recursive_sequence_lengths(lod)
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        tensor.set(np.random.random([6, 1]), fluid.CPUPlace())
+        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
+        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
 
+        # Each level's sum should be equal to the number of items in the next level
+        # Moreover, last level's sum should be equal to the tensor height
+        lod = [[2, 3], [1, 3, 1, 2, 2]]
+        tensor.set_recursive_sequence_lengths(lod)
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
         lod = [[2, 3], [1, 3, 1, 2, 1]]
-        converted_lod = [[0, 2, 5], [0, 1, 4, 5, 7, 8]]
-        self.assertEqual(_convert_lod(lod), converted_lod)
+        tensor.set_recursive_sequence_lengths(lod)
+        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
+        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
 
     def test_create_lod_tensor(self):
         # Create LoDTensor from a list
@@ -60,19 +57,19 @@ class TestLoDTensor(unittest.TestCase):
         self.assertRaises(AssertionError, create_lod_tensor, data, wrong_lod,
                           fluid.CPUPlace())
         tensor = create_lod_tensor(data, correct_lod, fluid.CPUPlace())
-        self.assertEqual(tensor.lod(), [[0, 3, 5]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), correct_lod)
 
         # Create LoDTensor from numpy array
-        data = numpy.random.random([10, 1])
+        data = np.random.random([10, 1])
         lod = [[2, 1], [3, 3, 4]]
         tensor = create_lod_tensor(data, lod, fluid.CPUPlace())
-        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
 
         # Create LoDTensor from another LoDTensor, they are differnt instances
         new_lod = [[2, 2, 1], [1, 2, 2, 3, 2]]
         new_tensor = create_lod_tensor(tensor, new_lod, fluid.CPUPlace())
-        self.assertEqual(tensor.lod(), [[0, 2, 3], [0, 3, 6, 10]])
-        self.assertEqual(new_tensor.lod(), [[0, 2, 4, 5], [0, 1, 3, 5, 8, 10]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
+        self.assertEqual(new_tensor.recursive_sequence_lengths(), new_lod)
 
     def test_create_random_int_lodtensor(self):
         # The shape of a word, commonly used in speech and NLP problem, is [1]
@@ -83,7 +80,7 @@ class TestLoDTensor(unittest.TestCase):
         high = dict_size - 1
         tensor = create_random_int_lodtensor(lod, shape,
                                              fluid.CPUPlace(), low, high)
-        self.assertEqual(tensor.lod(), [[0, 2, 5, 10]])
+        self.assertEqual(tensor.recursive_sequence_lengths(), lod)
         self.assertEqual(tensor.shape(), [10, 1])
 
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index ab683bc101728ba008e01f26ff4d3828b3b99787..21182393bd68db4a379fc3ecf83fc85d27ca9490 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME)
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
-#list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
-#list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 # TODO(wuyi): this test hungs on CI, will add it back later
 list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
 foreach(TEST_OP ${TEST_OPS})
@@ -50,3 +50,5 @@ foreach(TEST_OP ${TEST_OPS})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
+py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 307caae4b0cf4869c1abb755215aa97795d47e15..e056ef9952a519d6c4d580b27f1118a3a91f13af 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -162,7 +162,7 @@ class OpTest(unittest.TestCase):
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
                         tensor.set(np_value[0], place)
-                        tensor.set_lod(np_value[1])
+                        tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
                         tensor.set(np_value, place)
                     feed_map[name] = tensor
@@ -170,7 +170,8 @@ class OpTest(unittest.TestCase):
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
                     tensor.set(self.inputs[var_name][0], place)
-                    tensor.set_lod(self.inputs[var_name][1])
+                    tensor.set_recursive_sequence_lengths(self.inputs[var_name][
+                        1])
                 else:
                     tensor.set(self.inputs[var_name], place)
                 feed_map[var_name] = tensor
@@ -293,7 +294,8 @@ class OpTest(unittest.TestCase):
                         str(place))
                     if isinstance(expect, tuple):
                         self.assertListEqual(
-                            actual.lod(), expect[1], "Output (" + sub_out_name +
+                            actual.recursive_sequence_lengths(), expect[1],
+                            "Output (" + sub_out_name +
                             ") has different lod at " + str(place))
             else:
                 idx = find_actual(out_name, fetch_list)
@@ -307,8 +309,8 @@ class OpTest(unittest.TestCase):
                     "Output (" + out_name + ") has diff at " + str(place) +
                     str(actual_t) + "\n" + str(expect_t))
                 if isinstance(expect, tuple):
-                    self.assertListEqual(actual.lod(), expect[1],
-                                         "Output (" + out_name +
+                    self.assertListEqual(actual.recursive_sequence_lengths(),
+                                         expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
     def _get_places(self):
@@ -408,7 +410,7 @@ class OpTest(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(np_value, place)
         if lod is not None:
-            tensor.set_lod(lod)
+            tensor.set_recursive_sequence_lengths(lod)
         return tensor
 
     @staticmethod
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 4216d83653b27ec7f18034e576fbedbecc3f1cfe..01e5749bdb9729c697af1ae87d993a2da66217f8 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -128,7 +128,7 @@ def create_or_get_tensor(scope, var_name, var, place):
     tensor = scope.var(var_name).get_tensor()
     if var is not None:
         assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
+        tensor.set_recursive_sequence_lengths([])
         tensor.set_dims(var.shape)
         tensor.set(var, place)
     return tensor
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index 7976dd7c3f14390fb00bc8ab39121b6a686e3039..4e1687477c6b89b34f0b35823f9587704a131e85 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -26,36 +26,36 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
 
     def append_lod_tensor(self, tensor_array, lod, data):
         lod_tensor = core.LoDTensor()
-        lod_tensor.set_lod(lod)
+        lod_tensor.set_recursive_sequence_lengths(lod)
         lod_tensor.set(data, self.place)
         tensor_array.append(lod_tensor)
 
     def test_get_set(self):
         ids = self.scope.var("ids").get_lod_tensor_array()
         self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            ids, [[3, 3], [1, 1, 1, 1, 1, 1]],
             np.array(
                 [1, 2, 3, 4, 5, 6], dtype="int64"))
         self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            ids, [[3, 3], [1, 0, 2, 2, 0, 1]],
             np.array(
                 [0, 1, 2, 3, 4, 5], dtype="int64"))
         self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            ids, [[3, 3], [0, 1, 1, 1, 1, 1]],
             np.array(
                 [0, 1, 2, 3, 4], dtype="int64"))
 
         scores = self.scope.var("scores").get_lod_tensor_array()
         self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            scores, [[3, 3], [1, 1, 1, 1, 1, 1]],
             np.array(
                 [1, 2, 3, 4, 5, 6], dtype="float64"))
         self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            scores, [[3, 3], [1, 0, 2, 2, 0, 1]],
             np.array(
                 [0, 1, 2, 3, 4, 5], dtype="float64"))
         self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            scores, [[3, 3], [0, 1, 1, 1, 1, 1]],
             np.array(
                 [0, 1, 2, 3, 4], dtype="float64"))
 
@@ -73,9 +73,11 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
 
         beam_search_decode_op.run(self.scope, self.place)
 
-        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
-        self.assertEqual(sentence_ids.lod(), expected_lod)
-        self.assertEqual(sentence_scores.lod(), expected_lod)
+        expected_lod = [[4, 4], [1, 2, 3, 3, 1, 3, 3, 3]]
+        self.assertEqual(sentence_ids.recursive_sequence_lengths(),
+                         expected_lod)
+        self.assertEqual(sentence_scores.recursive_sequence_lengths(),
+                         expected_lod)
 
         expected_data = np.array(
             [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index bc708f3aff54f54d290684d68afa503a50a32dac..5a14178c278c76b060b79facc041f0853d09c370 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -48,18 +48,18 @@ class BeamSearchOpTester(unittest.TestCase):
         op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
         print 'selected_ids', np.array(selected_ids)
-        print 'lod', selected_ids.lod()
+        print 'lod', selected_ids.recursive_sequence_lengths()
 
     def _create_pre_ids(self):
         np_data = np.array([[1, 2, 3, 4]], dtype='int64')
         tensor = create_tensor(self.scope, "pre_ids", np_data)
 
     def _create_ids(self):
-        self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]]
+        self.lod = [[1, 3], [1, 1, 1, 1]]
         np_data = np.array(
             [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
         tensor = create_tensor(self.scope, "ids", np_data)
-        tensor.set_lod(self.lod)
+        tensor.set_recursive_sequence_lengths(self.lod)
 
     def _create_scores(self):
         np_data = np.array(
@@ -71,7 +71,7 @@ class BeamSearchOpTester(unittest.TestCase):
             ],
             dtype='float32')
         tensor = create_tensor(self.scope, "scores", np_data)
-        tensor.set_lod(self.lod)
+        tensor.set_recursive_sequence_lengths(self.lod)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index f7461ee6dab699064153332116449c8e20a0bac0..1a245fd756cb2bcaca720f10fa35fd3d2a45cd4d 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -65,23 +65,25 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
         distance (numpy.array) : The distance of two entries with shape [M, N].
         lod (list of int): The offsets of each input in this batch.
     """
-    n = len(lod) - 1
+    n = len(lod)
     m = distance.shape[1]
     match_indices = -1 * np.ones((n, m), dtype=np.int)
     match_dist = np.zeros((n, m), dtype=np.float32)
-    for i in range(len(lod) - 1):
-        bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                        match_dist[i, :])
+    cur_offset = 0
+    for i in range(n):
+        bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :],
+                        match_indices[i, :], match_dist[i, :])
         if match_type == 'per_prediction':
-            argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                         match_dist[i, :], dist_threshold)
+            argmax_match(distance[cur_offset:(cur_offset + lod[i]), :],
+                         match_indices[i, :], match_dist[i, :], dist_threshold)
+        cur_offset += lod[i]
     return match_indices, match_dist
 
 
 class TestBipartiteMatchOpWithLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 5, 11, 23]]
+        lod = [[5, 6, 12]]
         dist = np.random.random((23, 217)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
@@ -98,7 +100,7 @@ class TestBipartiteMatchOpWithLoD(OpTest):
 class TestBipartiteMatchOpWithoutLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 8]]
+        lod = [[8]]
         dist = np.random.random((8, 17)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
@@ -115,7 +117,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
 class TestBipartiteMatchOpWithPerPredictionType(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 5, 11, 23]]
+        lod = [[5, 6, 12]]
         dist = np.random.random((23, 237)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0],
                                                           'per_prediction', 0.5)
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index b4c48d85f2c564d877c0a29e64dd2944d2b26ea3..4ce9a4783e2332b6882164a70e1462c6a6d31bef 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -81,15 +81,19 @@ def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
     n = target_box.shape[0]
     m = prior_box.shape[0]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
-    for i in range(len(lod) - 1):
+    cur_offset = 0
+    for i in range(len(lod)):
         if (code_type == "EncodeCenterSize"):
-            box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
-                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
+            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :],
+                      prior_box, prior_box_var,
+                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
                       code_type, box_normalized)
         elif (code_type == "DecodeCenterSize"):
-            box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
-                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
+            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :],
+                      prior_box, prior_box_var,
+                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
                       code_type, box_normalized)
+        cur_offset += lod[i]
     return output_box
 
 
@@ -99,7 +103,7 @@ class TestBoxCoderOp(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 1, 2, 3, 4, 5]]
+        lod = [[1, 1, 1, 1, 1]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((5, 10, 4)).astype('float32')
@@ -152,7 +156,7 @@ class TestBoxCoderOpWithLoD(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 4, 12, 20]]
+        lod = [[4, 8, 8]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((20, 4)).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
index 050df2801c98e8f4167cdd1b4dde858c9f9f07dd..23932194f0ca97954ec9ade3fdcaebd7a32749a0 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
@@ -144,10 +144,10 @@ class TestChunkEvalOp(OpTest):
         starts = sorted(starts)
         self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
             infer, label, starts)
-        self.inputs = {
-            'Inference': (infer, [starts]),
-            'Label': (label, [starts])
-        }
+        lod = []
+        for i in range(len(starts) - 1):
+            lod.append(starts[i + 1] - starts[i])
+        self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])}
         precision = float(
             self.num_correct_chunks
         ) / self.num_infer_chunks if self.num_infer_chunks else 0
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
index f397f542bb07519886d75618e2a915c2dbf61fce..122b076c2d3e3a69f52a2c335e2bc89707b4fa9b 100644
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
@@ -22,9 +22,9 @@ from op_test import OpTest
 class CRFDecoding(object):
     def __init__(self, emission_weights, transition_weights,
                  seq_start_positions):
-        assert (emission_weights.shape[0] == seq_start_positions[-1])
+        assert (emission_weights.shape[0] == sum(seq_start_positions))
         self.tag_num = emission_weights.shape[1]
-        self.seq_num = len(seq_start_positions) - 1
+        self.seq_num = len(seq_start_positions)
 
         self.seq_start_positions = seq_start_positions
         self.x = emission_weights
@@ -34,9 +34,9 @@ class CRFDecoding(object):
         self.w = transition_weights[2:, :]
 
         self.track = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="int64")
+            (sum(seq_start_positions), self.tag_num), dtype="int64")
         self.decoded_path = np.zeros(
-            (seq_start_positions[-1], 1), dtype="int64")
+            (sum(seq_start_positions), 1), dtype="int64")
 
     def _decode_one_sequence(self, decoded_path, x):
         seq_len, tag_num = x.shape
@@ -71,9 +71,11 @@ class CRFDecoding(object):
             decoded_path[i - 1] = max_idx = track[i, max_idx]
 
     def decode(self):
+        cur_pos = 0
         for i in range(self.seq_num):
-            start = self.seq_start_positions[i]
-            end = self.seq_start_positions[i + 1]
+            start = cur_pos
+            cur_pos += self.seq_start_positions[i]
+            end = cur_pos
             self._decode_one_sequence(self.decoded_path[start:end, :],
                                       self.x[start:end, :])
         return self.decoded_path
@@ -90,11 +92,13 @@ class TestCRFDecodingOp1(OpTest):
         TAG_NUM = 17
         MAX_SEQ_LEN = 10
 
-        lod = [[0]]
+        lod = [[]]
+        total_len = 0
         for i in range(SEQ_NUM):
-            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            total_len += lod[-1][-1]
         emission = np.random.uniform(-1, 1,
-                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+                                     [total_len, TAG_NUM]).astype("float64")
         transition = np.random.uniform(-0.5, 0.5,
                                        [TAG_NUM + 2, TAG_NUM]).astype("float64")
 
@@ -126,7 +130,8 @@ class TestCRFDecodingOp2(OpTest):
         self.op_type = "crf_decoding"
         TAG_NUM = 5
 
-        lod = [[0, 1, 3, 6, 10]]
+        lod = [[1, 2, 3, 4]]
+        total_len = sum(lod[-1])
         transition = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
@@ -135,13 +140,13 @@ class TestCRFDecodingOp2(OpTest):
         emission = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            lod[-1][-1],
+            total_len,
             axis=0)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+            low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64")
         predicted_labels = np.ones(
-            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
+            (total_len, 1), dtype="int64") * (TAG_NUM - 1)
         expected_output = (labels == predicted_labels).astype("int64")
 
         self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index f166031a1cbbaa5e312f5c7919b39648d0dad013..131b4076f45ae25b45bb3f64da07a5c3aacc43d5 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -22,14 +22,16 @@ from test_softmax_op import stable_softmax
 def CTCAlign(input, lod, blank, merge_repeated):
     lod0 = lod[0]
     result = []
-    for i in range(len(lod0) - 1):
+    cur_offset = 0
+    for i in range(len(lod0)):
         prev_token = -1
-        for j in range(lod0[i], lod0[i + 1]):
+        for j in range(cur_offset, cur_offset + lod0[i]):
             token = input[j][0]
             if (token != blank) and not (merge_repeated and
                                          token == prev_token):
                 result.append(token)
             prev_token = token
+        cur_offset += lod0[i]
     result = np.array(result).reshape([len(result), 1]).astype("int32")
     if len(result) == 0:
         result = np.array([-1])
@@ -39,7 +41,7 @@ def CTCAlign(input, lod, blank, merge_repeated):
 class TestCTCAlignOp(OpTest):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 11, 18]]
+        self.input_lod = [[11, 7]]
         self.blank = 0
         self.merge_repeated = False
         self.input = np.array(
@@ -66,7 +68,7 @@ class TestCTCAlignOp(OpTest):
 class TestCTCAlignOpCase1(TestCTCAlignOp):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 11, 19]]
+        self.input_lod = [[11, 8]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array(
@@ -77,7 +79,7 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
 class TestCTCAlignOpCase2(TestCTCAlignOp):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 4]]
+        self.input_lod = [[4]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index f545ad155ccd28c2d34e424d307eed49b37f20fb..05d3367ad8ec2bc3df794015a7c25e943a26c68c 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -74,13 +74,13 @@ class TestDetectionMAPOp(OpTest):
         self.evaluate_difficult = True
         self.ap_type = "integral"
 
-        self.label_lod = [[0, 2, 4]]
+        self.label_lod = [[2, 2]]
         # label difficult xmin ymin xmax ymax
         self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8],
                       [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]]
 
         # label score xmin ymin xmax ymax difficult
-        self.detect_lod = [[0, 3, 7]]
+        self.detect_lod = [[3, 4]]
         self.detect = [
             [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
             [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
@@ -89,7 +89,7 @@ class TestDetectionMAPOp(OpTest):
         ]
 
         # label score true_pos false_pos
-        self.tf_pos_lod = [[0, 3, 7]]
+        self.tf_pos_lod = [[3, 4]]
         self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1],
                        [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0],
                        [3, 0.2, 0, 1]]
@@ -112,15 +112,19 @@ class TestDetectionMAPOp(OpTest):
             for i, count in enumerate(class_pos_count):
                 class_pos_count_dict[i] = count
 
-            for i in range(len(true_pos_lod[0]) - 1):
-                start = true_pos_lod[0][i]
-                end = true_pos_lod[0][i + 1]
+            cur_pos = 0
+            for i in range(len(true_pos_lod[0])):
+                start = cur_pos
+                cur_pos += true_pos_lod[0][i]
+                end = cur_pos
                 for j in range(start, end):
                     true_pos_dict[i].append(true_pos[j])
 
-            for i in range(len(false_pos_lod[0]) - 1):
-                start = false_pos_lod[0][i]
-                end = false_pos_lod[0][i + 1]
+            cur_pos = 0
+            for i in range(len(false_pos_lod[0])):
+                start = cur_pos
+                cur_pos += false_pos_lod[0][i]
+                end = cur_pos
                 for j in range(start, end):
                     false_pos_dict[i].append(false_pos[j])
 
@@ -130,19 +134,19 @@ class TestDetectionMAPOp(OpTest):
             label_number = self.class_num
 
             out_class_pos_count = []
-            out_true_pos_lod = [0]
+            out_true_pos_lod = []
             out_true_pos = []
-            out_false_pos_lod = [0]
+            out_false_pos_lod = []
             out_false_pos = []
 
             for i in range(label_number):
                 out_class_pos_count.append([label_count[i]])
                 true_pos_list = true_pos[i]
                 out_true_pos += true_pos_list
-                out_true_pos_lod.append(len(out_true_pos))
+                out_true_pos_lod.append(len(true_pos_list))
                 false_pos_list = false_pos[i]
                 out_false_pos += false_pos_list
-                out_false_pos_lod.append(len(out_false_pos))
+                out_false_pos_lod.append(len(false_pos_list))
 
             return out_class_pos_count, out_true_pos, [
                 out_true_pos_lod
@@ -241,7 +245,7 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
 
         self.evaluate_difficult = False
 
-        self.tf_pos_lod = [[0, 2, 6]]
+        self.tf_pos_lod = [[2, 4]]
         # label score true_pos false_pos
         self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0],
                        [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
@@ -267,9 +271,9 @@ class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOpMultiBatch, self).init_test_case()
         self.class_pos_count = [0, 2, 1]
-        self.true_pos_lod = [[0, 0, 3, 5]]
+        self.true_pos_lod = [[0, 3, 2]]
         self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
-        self.false_pos_lod = [[0, 0, 3, 5]]
+        self.false_pos_lod = [[0, 3, 2]]
         self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
index 2314bb2ed8a4eeb34752fd5d040f8a8476798aa6..562e66b0625083fe840d64967249f0215cfda1f9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -16,6 +16,7 @@ import os
 import time
 import unittest
 from multiprocessing import Process
+import signal
 
 import numpy
 
@@ -24,9 +25,6 @@ import paddle.fluid.layers as layers
 
 
 class TestSendOp(unittest.TestCase):
-    @unittest.skip(
-        "This test is buggy. We cannot use time.sleep to sync processes, the connection may fail in unittest."
-    )
     def test_send(self):
         # Run init_serv in a thread
         place = fluid.CPUPlace()
@@ -35,7 +33,9 @@ class TestSendOp(unittest.TestCase):
         p.daemon = True
         p.start()
 
-        time.sleep(10)
+        self.ps_timeout = 5
+        self._wait_ps_ready(p.pid)
+
         with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
             selected_port = int(fn.readlines()[0])
         self.init_client(place, selected_port)
@@ -44,9 +44,23 @@ class TestSendOp(unittest.TestCase):
         self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
 
         # FIXME(typhoonzero): find a way to gracefully shutdown the server.
-        os.system("kill -9 %d" % p.pid)
+        os.kill(p.pid, signal.SIGKILL)
         p.join()
 
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
     def init_serv(self, place):
         main = fluid.Program()
 
@@ -84,7 +98,10 @@ class TestSendOp(unittest.TestCase):
                 dtype="float32",
                 persistable=False,
                 shape=[32, 32])
-            o = layers.Send("127.0.0.1:%d" % port, [x], [get_var])
+            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
+            layers.Send("127.0.0.1:%d" % port, [x])
+            o = layers.Recv("127.0.0.1:%d" % port, [get_var])
+
         exe = fluid.Executor(place)
         self.dist_out = exe.run(main, fetch_list=o)  # o is a list
 
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 95af51f1b2f8cd9492baa9cb14fe31ffa586f2fc..0f289af284773caf8515f9cbdd38e0d4481e4e44 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -136,16 +136,16 @@ class BaseRNN(object):
         feed_dict = dict()
 
         for iname in self.inputs:
-            lod = [0]
+            lod = []
             np_flatten = []
             for seq_id in xrange(len(self.inputs[iname])):
                 seq_len = len(self.inputs[iname][seq_id])
-                lod.append(lod[-1] + seq_len)
+                lod.append(seq_len)
                 np_flatten.extend(self.inputs[iname][seq_id])
 
             t = fluid.Tensor()
             t.set(numpy.array(np_flatten), place)
-            t.set_lod([lod])
+            t.set_recursive_sequence_lengths([lod])
             feed_dict[iname] = t
 
         for pname in self.params:
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index d3f63ee2c414a71309be8f0af6d3e5912078ecdb..92e718662dfd7998be3ede2994f160059679fa8a 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -39,20 +39,20 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
     def prepare_x_tensor(self):
         self.x_tensor_dim = 10
-        lod = [[0, 2, 3, 6]]
-        shape = [lod[0][-1], self.x_tensor_dim]
+        lod = [[2, 1, 3]]
+        shape = [sum(lod[0]), self.x_tensor_dim]
         self.x_tensor_data = np.random.random(shape).astype('float32')
         self.x_tensor = core.LoDTensor()
-        self.x_tensor.set_lod(lod)
+        self.x_tensor.set_recursive_sequence_lengths(lod)
         self.x_tensor.set(self.x_tensor_data, self.place)
 
     def prepare_static_input_tensor(self):
         self.static_input_tensor_dim = 4
-        lod = [[0, 1, 3, 6]]
-        shape = [lod[0][-1], self.static_input_tensor_dim]
+        lod = [[1, 2, 3]]
+        shape = [sum(lod[0]), self.static_input_tensor_dim]
         self.static_input_data = np.random.random(shape).astype('float32')
         self.static_input_tensor = core.LoDTensor()
-        self.static_input_tensor.set_lod(lod)
+        self.static_input_tensor.set_recursive_sequence_lengths(lod)
         self.static_input_tensor.set(self.static_input_data, self.place)
 
     def fetch_value(self, var):
@@ -69,7 +69,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
         ndarray = np.zeros(shape=dims).astype('float32')
         for i in xrange(np.product(dims)):
             ndarray.ravel()[i] = lod_tensor.get_float_element(i)
-        return ndarray, lod_tensor.lod()
+        return ndarray, lod_tensor.recursive_sequence_lengths()
 
     def build_graph(self, only_forward=False):
         x_tensor = fluid.layers.data(
@@ -131,21 +131,20 @@ class TestDyRnnStaticInput(unittest.TestCase):
             framework.grad_var_name('static_input_tensor'))
         return static_input_grad, loss
 
-    def get_seq_len_from_lod(self, lod):
-        return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)]
-
     def get_expected_static_step_outs(self):
-        x_lod = self.x_tensor.lod()
-        x_seq_len = self.get_seq_len_from_lod(x_lod)
+        x_lod = self.x_tensor.recursive_sequence_lengths()
+        x_seq_len = x_lod[0]
         x_seq_len_sorted = sorted(x_seq_len)
         x_sorted_indices = np.argsort(x_seq_len)[::-1]
 
-        static_lod = self.static_input_tensor.lod()
-        static_sliced = [
-            self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]]
-            for i in xrange(len(static_lod[0]) - 1)
-        ]
-        static_seq_len = self.get_seq_len_from_lod(static_lod)
+        static_lod = self.static_input_tensor.recursive_sequence_lengths()
+        static_sliced = []
+        cur_offset = 0
+        for i in xrange(len(static_lod[0])):
+            static_sliced.append(self.static_input_data[cur_offset:(
+                cur_offset + static_lod[0][i])])
+            cur_offset += static_lod[0][i]
+        static_seq_len = static_lod[0]
         static_reordered = []
         for i in xrange(len(x_sorted_indices)):
             static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
@@ -159,11 +158,13 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
         for i in xrange(self._max_sequence_len):
             end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
-            lod = [0]
+            lod = []
+            total_len = 0
             for i in xrange(end):
-                lod.append(static_seq_len_reordered[i] + lod[-1])
+                lod.append(static_seq_len_reordered[i])
+                total_len += lod[-1]
             static_step_lods.append([lod])
-            end = lod[-1]
+            end = total_len
             static_step_outs.append(
                 np.array(static_reordered[:end]).astype('float32'))
 
@@ -199,7 +200,9 @@ class TestDyRnnStaticInput(unittest.TestCase):
             self.static_input_tensor.set_float_element(i, origin)
             numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
         self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
-        self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod()))
+        self.assertTrue(
+            np.allclose(actual_lod,
+                        self.static_input_tensor.recursive_sequence_lengths()))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
index 2957fb50586c8bce74bbf8066e0e9bf24d79cb7d..816562621b4fc749f3c6b0eca8ee3c5850ef1ba9 100644
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
@@ -52,23 +52,29 @@ class TestEditDistanceOp(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = False
-        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64")
+        x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[12, 4, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [0, 1, 5]
-        x2_lod = [0, 3, 4]
+        x1_lod = [1, 4]
+        x2_lod = [3, 1]
 
-        num_strs = len(x1_lod) - 1
+        num_strs = len(x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(2).astype("int64")
+
+        x1_offset = 0
+        x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
-                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
+            x1_offset += x1_lod[i]
+            x2_offset += x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i + 1] - x2_lod[i]
+                len_ref = x2_lod[i]
                 distance[i] = distance[i] / len_ref
+
         self.attrs = {'normalized': normalized}
         self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
@@ -81,23 +87,29 @@ class TestEditDistanceOpNormalized(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = True
-        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64")
+        x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [0, 1, 3, 6]
-        x2_lod = [0, 2, 3, 5]
+        x1_lod = [1, 2, 3]
+        x2_lod = [2, 1, 2]
 
-        num_strs = len(x1_lod) - 1
+        num_strs = len(x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(3).astype("int64")
+
+        x1_offset = 0
+        x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
-                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
+            x1_offset += x1_lod[i]
+            x2_offset += x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i + 1] - x2_lod[i]
+                len_ref = x2_lod[i]
                 distance[i] = distance[i] / len_ref
+
         self.attrs = {'normalized': normalized}
         self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
index 9d724a6479f061996359b1efcc5f61f0564331c7..8b9da843115409c65055927d317867d1290c8f0e 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
@@ -24,17 +24,16 @@ class TestFeedFetch(unittest.TestCase):
         input_array = np.ones((4, 4, 6)).astype("float32")
         input_array[0, 0, 0] = 3
         input_array[3, 3, 5] = 10
-        input_tensor = core.LoDTensor([[0, 2, 4]])
+        input_tensor = core.LoDTensor([[2, 2]])
         input_tensor.set(input_array, place)
 
         core.set_feed_variable(scope, input_tensor, "feed", 0)
 
         output_tensor = core.get_fetch_variable(scope, "feed", 0)
 
-        output_lod = output_tensor.lod()
-        self.assertEqual(0, output_lod[0][0])
+        output_lod = output_tensor.recursive_sequence_lengths()
+        self.assertEqual(2, output_lod[0][0])
         self.assertEqual(2, output_lod[0][1])
-        self.assertEqual(4, output_lod[0][2])
 
         output_array = np.array(output_tensor)
         self.assertEqual(3, output_array[0, 0, 0])
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
index 533d8ccfac82a2e298af16181ab16bf7aa3db282..0c75cf33f5f208d11081a6802910c25553b8c4ec 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
@@ -55,7 +55,7 @@ class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
         self.op_type = "fill_constant_batch_size_like"
         self.inputs = {
             'Input': (np.random.random((31, 28)).astype("float32"),
-                      [[0, 9, 23, 31]])
+                      [[9, 14, 8]])
         }
         self.attrs = {
             'value': 3.5,
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 3a13eb872a8646cede126b667864dfc3784ebd0b..8fbf1560859aa295fc40b36129d0f0d07d55dd9f 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -20,8 +20,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
 
 
 class TestGRUOp(OpTest):
-    lod = [[0, 2, 6, 9]]
-    batch_size = lod[0][-1]
+    lod = [[2, 4, 3]]
+    batch_size = sum(lod[0])
     frame_size = 5
     activate = {
         'identity': identity,
@@ -33,10 +33,10 @@ class TestGRUOp(OpTest):
     @staticmethod
     def seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
-        seq_starts = lod[0]
-        seq_lens = []
-        for i in range(len(seq_starts) - 1):
-            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        seq_lens = lod[0]
+        seq_starts = [0]
+        for i in range(len(seq_lens)):
+            seq_starts.append(seq_starts[-1] + seq_lens[i])
         sorted_seqs = sorted(
             range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
         num_batch = seq_lens[sorted_seqs[0]]
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 587e2025e1045f63a5825f884d4dcad8b4685e62..15a72cb605911dfe957fb927763174521a30a085 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -364,5 +364,22 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.attr('seed'), 134)
 
 
+class TestMSRAInitializer(unittest.TestCase):
+    def test_bilinear_initializer(self):
+        """Test the bilinear initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[8, 1, 3, 3],
+            lod_level=0,
+            name="param",
+            initializer=initializer.BilinearInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
index 8f62ac20a5c13257a1519128292e2abc4962bf84..eff4212d91e609a7ef531280bbd3cf3671a59830 100644
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
@@ -58,8 +58,8 @@ class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
 
     def setUp(self):
         super(TestIOUSimilarityOpWithLoD, self).setUp()
-        self.boxes1_lod = [[0, 1, 2]]
-        self.output_lod = [[0, 1, 2]]
+        self.boxes1_lod = [[1, 1]]
+        self.output_lod = [[1, 1]]
 
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.outputs = {'Out': (self.output, self.output_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
index f49f7635f76c9feb5b5593438cb445df9488c69b..696d0ab4fa81a409a2bf0d6f6f23779ec26eb6d2 100644
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -105,11 +105,13 @@ class TestLinearChainCrfOp(OpTest):
         MAX_SEQ_LEN = 5
 
         # the linear_chain_crf operator only supports sequence (LoD level = 1)
-        lod = [[0]]
+        lod = [[]]
+        seq_start_pos = [0]
         for i in range(SEQ_NUM):
-            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
-        emission = np.random.uniform(-1, 1,
-                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
+        emission = np.random.uniform(
+            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
         emission_row_max = np.amax(emission, axis=1, keepdims=True)
         emission_exps = np.exp(emission - emission_row_max)
 
@@ -118,14 +120,14 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),
             "Transition": transition,
             "Label": (labels, lod)
         }
-        crf = LinearChainCrfForward(lod[0], emission, emission_row_max,
+        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
                                     emission_exps, transition, transition_exps,
                                     labels)
         alpha, log_likelihood = crf.crf_forward_compute()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
index d1d709551c77908db88be6fda7ac74d4e922138e..9dec2acb1d7101f8f00565c56e0469edb143d0c6 100644
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -57,17 +57,18 @@ class TestListenAndServOp(OpTest):
     def setUp(self):
         self.ps_timeout = 5
         self.ip = "127.0.0.1"
-        self.port = "6173"
+        self.port = "0"
         self.trainers = 1
-        self.trainer_id = 1
+        self.trainer_id = 0
 
     def _start_pserver(self, use_cuda, sync_mode):
         p = Process(
             target=run_pserver,
             args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
                   self.trainer_id))
+        p.daemon = True
         p.start()
-        return p.pid
+        return p
 
     def _wait_ps_ready(self, pid):
         start_left_time = self.ps_timeout
@@ -89,18 +90,20 @@ class TestListenAndServOp(OpTest):
 
     def test_handle_signal_in_serv_op(self):
         # run pserver on CPU in sync mode
-        pid = self._start_pserver(False, True)
-        self._wait_ps_ready(pid)
+        p1 = self._start_pserver(False, True)
+        self._wait_ps_ready(p1.pid)
 
         # raise SIGTERM to pserver
-        os.kill(pid, signal.SIGTERM)
+        os.kill(p1.pid, signal.SIGKILL)
+        p1.join()
 
         # run pserver on CPU in async mode
-        pid = self._start_pserver(False, False)
-        self._wait_ps_ready(pid)
+        p2 = self._start_pserver(False, False)
+        self._wait_ps_ready(p2.pid)
 
         # raise SIGTERM to pserver
-        os.kill(pid, signal.SIGTERM)
+        os.kill(p2.pid, signal.SIGKILL)
+        p2.join()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index 093eecb8370b8ae7e4c43ce7ca6f50f5d302bd60..bac5e502318397b43e9867d5fc9e4e8cd33394b8 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -30,7 +30,8 @@ class TestLoDRankTable(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
-        tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
+        tensor.set_recursive_sequence_lengths(
+            [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]])
         exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
index 6b6d4c824aeae319dacf224408ce96a0d9c5bb35..77905c4b96499c855fd5c5e704b8051ccdb7a323 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
@@ -21,11 +21,15 @@ class TestLodResetOpByAttr(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0 = [0, 7, 10]
+        lod = [[3, 2, 5]]
+        # target_offset_lod and target_lod are the same lod info represented
+        # in offset-based format and length-based format, respectively.
+        target_offset_lod = [0, 7, 10]
+        target_lod = [7, 3]
         self.inputs = {'X': (x, lod)}
-        self.attrs = {'target_lod': target_lod_0}
-        self.outputs = {'Out': (x, [target_lod_0])}
+        # The `target_lod` attribute is still based on offset
+        self.attrs = {'target_lod': target_offset_lod}
+        self.outputs = {'Out': (x, [target_lod])}
 
     def test_check_output(self):
         self.check_output()
@@ -38,13 +42,16 @@ class TestLodResetOpByInput(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0 = [0, 4, 7, 10]
+        lod = [[3, 2, 5]]
+        # target_offset_lod and target_lod are the same lod info represented
+        # in offset-based format and length-based format, respectively.
+        target_offset_lod = [0, 4, 7, 10]
+        target_lod = [4, 3, 3]
         self.inputs = {
             'X': (x, lod),
-            'Y': np.array([target_lod_0]).astype('int32')
+            'Y': np.array([target_offset_lod]).astype('int32')
         }
-        self.outputs = {'Out': (x, [target_lod_0])}
+        self.outputs = {'Out': (x, [target_lod])}
 
     def test_check_output(self):
         self.check_output()
@@ -57,15 +64,16 @@ class TestLodResetOpBoth(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0_attr = [0, 7, 10]
-        target_lod_0_in = [0, 4, 7, 10]
+        lod = [[3, 2, 5]]
+        target_offset_lod_attr = [0, 7, 10]
+        target_offset_lod_in = [0, 4, 7, 10]
+        target_lod_in = [4, 3, 3]
         self.inputs = {
             'X': (x, lod),
-            'Y': np.array(target_lod_0_in).astype('int32')
+            'Y': np.array(target_offset_lod_in).astype('int32')
         }
-        self.attrs = {'target_lod': target_lod_0_attr}
-        self.outputs = {'Out': (x, [target_lod_0_in])}
+        self.attrs = {'target_lod': target_offset_lod_attr}
+        self.outputs = {'Out': (x, [target_lod_in])}
 
     def test_check_output(self):
         self.check_output()
@@ -78,11 +86,11 @@ class TestLodResetOpYIsLoDTensor(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
+        lod = [[3, 2, 5]]
         y = np.random.random((10, 10)).astype("float32")
-        target_lod_0 = [[0, 4, 7, 10]]
-        self.inputs = {'X': (x, lod), 'Y': (y, target_lod_0)}
-        self.outputs = {'Out': (x, target_lod_0)}
+        target_lod = [[4, 3, 3]]
+        self.inputs = {'X': (x, lod), 'Y': (y, target_lod)}
+        self.outputs = {'Out': (x, target_lod)}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
index 63b17a5ccd62ed79b3d611e039c2b2705a133272..118c22fbb1ff6be5859ae9e4aed6218b0c77deec 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
@@ -27,7 +27,7 @@ class TestLoDTensorArray(unittest.TestCase):
         for i in xrange(10):
             t = core.LoDTensor()
             t.set(numpy.array([i], dtype='float32'), cpu)
-            t.set_lod([[0, 1]])
+            t.set_recursive_sequence_lengths([[1]])
             tensor_array.append(t)
 
         self.assertEqual(10, len(tensor_array))
@@ -35,17 +35,17 @@ class TestLoDTensorArray(unittest.TestCase):
         for i in xrange(10):
             t = tensor_array[i]
             self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
-            self.assertEqual([[0, 1]], t.lod())
+            self.assertEqual([[1]], t.recursive_sequence_lengths())
 
             t = core.LoDTensor()
             t.set(numpy.array([i + 10], dtype='float32'), cpu)
-            t.set_lod([[0, 2]])
+            t.set_recursive_sequence_lengths([[1]])
             tensor_array[i] = t
             t = tensor_array[i]
             self.assertEqual(
                 numpy.array(t), numpy.array(
                     [i + 10], dtype='float32'))
-            self.assertEqual([[0, 2]], t.lod())
+            self.assertEqual([[1]], t.recursive_sequence_lengths())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index 66a03640c148d769787593f41a44cd4d1aaa10b1..cebe6997bb4152519dabbabfc0404d6036bc4e65 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -29,7 +29,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
         self.main(
@@ -42,7 +42,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
         self.main(
@@ -55,7 +55,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+        tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]])
 
         expect = [
             numpy.array(
@@ -65,7 +65,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
                 [17, 18, 19], dtype='int32')
         ]
 
-        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        lod = [[[2, 3]], [[6, 6]], [[3]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -77,8 +77,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set(
             numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
 
-        tensor.set_lod([[0, 3, 5, 9, 11],
-                        [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]])
+        tensor.set_recursive_sequence_lengths(
+            [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]])
 
         expect = [
             numpy.array(
@@ -88,7 +88,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
         ]
 
-        lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
+        lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -99,8 +99,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
-                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        tensor.set_recursive_sequence_lengths(
+            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
+             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
 
         expect = [
             numpy.array(
@@ -108,8 +109,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
                 22, 39) + range(7, 21), range(39, 46)]
         ]
-        lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
-               [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
+        lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
+               [[2], [6, 1]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -120,8 +121,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
-                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        tensor.set_recursive_sequence_lengths(
+            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
+             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
         self.main(
             tensor=tensor,
             expect_array=None,
@@ -162,12 +164,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             exp_tensor, exp_lod = exp
             exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
             self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
-            self.assertEqual(exp_lod, array[i].lod())
+            self.assertEqual(exp_lod, array[i].recursive_sequence_lengths())
 
     def check_tensor_same(self, actual, expect):
         self.assertTrue(
             numpy.allclose(numpy.array(actual), numpy.array(expect)))
-        self.assertEqual(actual.lod(), expect.lod())
+        self.assertEqual(actual.recursive_sequence_lengths(),
+                         expect.recursive_sequence_lengths())
 
 
 class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
@@ -188,7 +191,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         g_vars = program.global_block().var(x.name + "@GRAD")
 
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index e726f99d49877a1bc464090092ec80b97ab15d0c..705a24bd8f39a55e0a352944d961f8d33aaf96ff 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -84,15 +84,17 @@ def lstm(
         h = g_o * act_cell(c)
         return h, c
 
-    def _reverse(x, lod):
+    def _reverse(x, offset):
         y = np.zeros_like(x)
-        for i in range(len(lod) - 1):
-            b, e = lod[i], lod[i + 1]
+        for i in range(len(offset) - 1):
+            b, e = offset[i], offset[i + 1]
             y[b:e, :] = np.flip(x[b:e, :], 0)
         return y
 
-    offset = lod[0]
-    batch_size = len(offset) - 1
+    offset = [0]
+    for l in lod[0]:
+        offset.append(offset[-1] + l)
+    batch_size = len(lod[0])
     hidden = []
     cell = []
     input = _reverse(input, offset) if is_reverse else input
@@ -100,7 +102,7 @@ def lstm(
         input = input + np.tile(w_b, (offset[-1], 1))
     for i in range(batch_size):
         # compute one sequence
-        seq_len = offset[i + 1] - offset[i]
+        seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
         h_pre = h0[i]  # 1 x D
         c_pre = c0[i]  # 1 x D
@@ -124,7 +126,7 @@ def lstm(
 
 class TestLstmOp(OpTest):
     def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
+        self.lod = [[2, 3, 2]]
         self.D = 16
 
         self.act_gate = 'sigmoid'
@@ -139,8 +141,8 @@ class TestLstmOp(OpTest):
         self.set_argument()
         self.op_type = 'lstm'
 
-        T = self.lod[0][-1]
-        N = len(self.lod[0]) - 1
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
@@ -186,7 +188,7 @@ class TestLstmOp(OpTest):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
@@ -196,7 +198,7 @@ class TestLstmOp(OpTest):
 
 # class TestLstmOpHasInitial(TestLstmOp):
 #     def set_argument(self):
-#         self.lod = [[0, 2, 5, 7]]
+#         self.lod = [[2, 3, 2]]
 #         self.D = 16
 
 #         self.act_gate = 'sigmoid'
@@ -209,7 +211,7 @@ class TestLstmOp(OpTest):
 
 #     def test_check_grad(self):
 #         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -218,7 +220,7 @@ class TestLstmOp(OpTest):
 #             max_relative_error=5e-4)
 
 #     def test_check_grad_ingore_bias(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -228,7 +230,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('Bias'))
 
 #     def test_check_grad_ingore_weight(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -238,7 +240,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('Weight'))
 
 #     def test_check_grad_ingore_input(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -248,7 +250,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('Input'))
 
 #     def test_check_grad_ingore_h0(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -258,7 +260,7 @@ class TestLstmOp(OpTest):
 #             no_grad_set=set('H0'))
 
 #     def test_check_grad_ingore_c0(self):
-#         N = len(self.lod[0]) - 1
+#         N = len(self.lod[0])
 #         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
 #         self.outputs['BatchCellPreAct'] = np.zeros(
 #             (N, self.D)).astype('float64')
@@ -269,7 +271,7 @@ class TestLstmOp(OpTest):
 
 # class TestLstmOpRerverse(TestLstmOp):
 #     def set_argument(self):
-#         self.lod = [[0, 2, 5, 7]]
+#         self.lod = [[2, 3, 2]]
 #         self.D = 16
 
 #         self.act_gate = 'sigmoid'
@@ -282,7 +284,7 @@ class TestLstmOp(OpTest):
 
 # class TestLstmOpNotUsePeepholes(TestLstmOp):
 #     def set_argument(self):
-#         self.lod = [[0, 2, 5, 7]]
+#         self.lod = [[2, 3, 2]]
 #         self.D = 16
 
 #         self.act_gate = 'sigmoid'
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index afff133f6c6cfe45d1aca4014dc8b92e6562e6b8..ed2262da4bc727657c2e65d69cb1922891e17b09 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -64,15 +64,17 @@ def lstmp(
         r = act_proj(r)
         return r, c
 
-    def _reverse(x, lod):
+    def _reverse(x, offset):
         y = np.zeros_like(x)
-        for i in range(len(lod) - 1):
-            b, e = lod[i], lod[i + 1]
+        for i in range(len(offset) - 1):
+            b, e = offset[i], offset[i + 1]
             y[b:e, :] = np.flip(x[b:e, :], 0)
         return y
 
-    offset = lod[0]
-    batch_size = len(offset) - 1
+    offset = [0]
+    for l in lod[0]:
+        offset.append(offset[-1] + l)
+    batch_size = len(lod[0])
     # recurrent projection state
     projection = []
     cell = []
@@ -81,7 +83,7 @@ def lstmp(
         input = input + np.tile(w_b, (offset[-1], 1))
     for i in range(batch_size):
         # compute one sequence
-        seq_len = offset[i + 1] - offset[i]
+        seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
         r_pre = np.dot(h0[i], w_rh)  # 1 x P
         r_pre = act_proj(r_pre)
@@ -117,8 +119,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         self.reset_argument()
         self.op_type = 'lstmp'
 
-        T = self.lod[0][-1]
-        N = len(self.lod[0]) - 1
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
@@ -166,7 +168,7 @@ class TestLstmpOp(LstmTest.TestLstmOp):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -183,7 +185,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -195,7 +197,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             max_relative_error=1e-2)
 
     def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -207,7 +209,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -219,7 +221,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Weight'))
 
     def test_check_grad_ingore_proj_weight(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -231,7 +233,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('ProjWeight'))
 
     def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -243,7 +245,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -255,7 +257,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
index c27573c3d69037bc48e0b6a90636b3f027f15a41..54ee85c1a7a539fe9517f32adb35ab99b5ae2a07 100644
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
@@ -70,7 +70,7 @@ class TestMineHardExamplesOp(OpTest):
 
         self.updated_match_indices = self.match_indices
 
-        self.neg_indices_lod = [[0, 1, 2]]
+        self.neg_indices_lod = [[1, 1]]
         self.neg_indices = np.array([[1], [0]]).astype('int32')
 
 
@@ -92,7 +92,7 @@ class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
         self.updated_match_indices = np.array([[0, -1, -1],
                                                [-1, -1, -1]]).astype('int32')
 
-        self.neg_indices_lod = [[0, 1, 3]]
+        self.neg_indices_lod = [[1, 2]]
         self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 6459913c0162374e17d0249627e7107a195babf8..aacd8ae45af10a2b19d2903ab121e9bb4f9de7ff 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -135,12 +135,12 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
     batch_size = scores.shape[0]
 
     det_outs = []
-    lod = [0]
+    lod = []
     for n in range(batch_size):
         nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background,
                                                score_threshold, nms_threshold,
                                                nms_top_k, keep_top_k)
-        lod.append(lod[-1] + nmsed_num)
+        lod.append(nmsed_num)
         if nmsed_num == 0: continue
 
         for c, indices in nmsed_outs.iteritems():
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index cd78cce8729ab2b5a0bb4817cf3022e53932283a..d13f2b3afde10f9b4e632094fa216d8729069afa 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -27,9 +27,9 @@ class TestOneHotOp(OpTest):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
-        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -50,9 +50,9 @@ class TestOneHotOp_default_dtype(OpTest):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
-        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -75,11 +75,11 @@ class TestOneHotOp_exception(OpTest):
         self.place = core.CPUPlace()
         self.dimension = 12
         self.x = core.LoDTensor()
-        x_lod = [[0, 4, 5, 8, 11]]
-        data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])]
-        data = np.array(data).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))]
+        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
         self.x.set(data, self.place)
-        self.x.set_lod(x_lod)
+        self.x.set_recursive_sequence_lengths(x_lod)
 
     def test_check_output(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
index 163975555ec2cea5c169cc1da3c4324d91ba3616..1ea7a6a5682318fb5f4ef8b3a08911df3cd44acf 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -173,6 +173,7 @@ class TestCRFModel(unittest.TestCase):
                           pe.run(feed=feeder.feed(cur_batch),
                                  fetch_list=[avg_cost.name]))[0]
 
+    @unittest.skip(reason="CI hangs")
     def test_update_sparse_parameter_all_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
@@ -181,6 +182,7 @@ class TestCRFModel(unittest.TestCase):
         self.check_network_convergence(
             is_sparse=True, build_strategy=build_strategy, use_cuda=False)
 
+    @unittest.skip(reason="CI hangs")
     def test_update_dense_parameter_all_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
@@ -189,6 +191,7 @@ class TestCRFModel(unittest.TestCase):
         self.check_network_convergence(
             is_sparse=False, build_strategy=build_strategy, use_cuda=False)
 
+    @unittest.skip(reason="CI hangs")
     def test_update_sparse_parameter_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
@@ -197,6 +200,7 @@ class TestCRFModel(unittest.TestCase):
         self.check_network_convergence(
             is_sparse=True, build_strategy=build_strategy, use_cuda=False)
 
+    @unittest.skip(reason="CI hangs")
     def test_update_dense_parameter_reduce(self):
         build_strategy = fluid.BuildStrategy()
         build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index c75080fbb96d472810e5d6a1d02a77c456006f66..e01af42a58b86042fd0282928d1a78d9c3239fe3 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -28,7 +28,7 @@ class TestPrintOpCPU(unittest.TestCase):
         self.x_tensor = core.LoDTensor()
         tensor_np = np.random.random(size=(2, 3)).astype('float32')
         self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_lod([[0, 1, 1]])
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
     def build_network(self, only_forward, **kargs):
         x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
@@ -62,7 +62,7 @@ class TestPrintOpGPU(TestPrintOpCPU):
         self.x_tensor = core.LoDTensor()
         tensor_np = np.random.random(size=(2, 3)).astype('float32')
         self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_lod([[0, 1, 1]])
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index 76d0d2f2fe80e409dc1b7fa858d43fbc6ad960ef..a70321bd800bf25eeb9e5d197ea7e08626b9aede 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -70,11 +70,10 @@ class TestReorderLoDTensor(unittest.TestCase):
                 lod_level_i = numpy.random.randint(
                     low=1,
                     high=5,
-                    size=self.num_seq if i == 0 else lod_level_i[-1])
-                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                    size=self.num_seq if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[data_lod[-1][-1] if data_lod else self.num_seq
+                size=[sum(data_lod[-1]) if data_lod else self.num_seq
                       ] + data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
@@ -84,29 +83,36 @@ class TestReorderLoDTensor(unittest.TestCase):
             tensor = fluid.Tensor()
             tensor.set(self.data[desc[0]][0], place)
             if self.data[desc[0]][1]:
-                tensor.set_lod(self.data[desc[0]][1])
+                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
             self.inputs[desc[0]] = tensor
 
     def reorder(self):
-        level = 0
+        def convert_to_offset(lod):
+            offset_lod = [[0] for i in lod]
+            for i, level in enumerate(lod):
+                for seq_len in level:
+                    offset_lod[i].append(offset_lod[i][-1] + seq_len)
+            return offset_lod
 
+        level = 0
         # compute the rank_table according to ref_lod
         ref_lod = self.data[self.data_desc[1][0]][1][level]
         rank_table = []  # list of (index, length)
-        for i in range(len(ref_lod) - 1):
-            rank_table.append((i, ref_lod[i + 1] - ref_lod[i]))
+        for i in range(len(ref_lod)):
+            rank_table.append((i, ref_lod[i]))
         rank_table = sorted(rank_table, lambda x, y: y[1] - x[1])
 
         # compute the input sequence info according to input_lod
         input_value, input_lod = self.data[self.data_desc[0][0]]
+        offset_lod = convert_to_offset(input_lod)
 
         input_table = []  # list of (offset, length, sub_lod)
-        if input_lod:
-            for i in range(len(input_lod[level]) - 1):
+        if offset_lod:
+            for i in range(len(offset_lod[level]) - 1):
                 start_idx = i
                 end_idx = i + 1
                 sub_lod = []
-                for lod_level_i in input_lod[level:]:
+                for lod_level_i in offset_lod[level:]:
                     sub_lod_i = []
                     for idx in range(start_idx, end_idx):
                         sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
@@ -132,10 +138,9 @@ class TestReorderLoDTensor(unittest.TestCase):
 
             input_seq_sub_lod = input_table[index][2]
             if len(output_lod) == 0:
-                output_lod = [[0] for i in input_seq_sub_lod]
-            for i, sub_lod_i in enumerate(input_seq_sub_lod):
-                for idx_sub in sub_lod_i:
-                    output_lod[i].append(output_lod[i][-1] + idx_sub)
+                output_lod = [[] for i in input_seq_sub_lod]
+            for i, level in enumerate(input_seq_sub_lod):
+                output_lod[i].extend(level)
         return output_value, output_lod
 
     def test_reorder_lod_tensor(self):
@@ -148,7 +153,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod, actual_output.lod())
+            self.assertEqual(expect_output_lod,
+                             actual_output.recursive_sequence_lengths())
         # check gradient
         expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
@@ -156,7 +162,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod, actual_grad.lod())
+            self.assertEqual(expect_grad_lod,
+                             actual_grad.recursive_sequence_lengths())
 
     def test_reorder_tensor(self):
         self.data_desc[0][-1] = 0  # input is tensor
@@ -168,7 +175,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod, actual_output.lod())
+            self.assertEqual(expect_output_lod,
+                             actual_output.recursive_sequence_lengths())
         # check gradient
         expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
@@ -176,14 +184,14 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod, actual_grad.lod())
+            self.assertEqual(expect_grad_lod,
+                             actual_grad.recursive_sequence_lengths())
 
         # compare outputs between LodTensors with explicit and implicit lod
         # use the same data but set the input lod explicitly
-        input_lod = [[
-            i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1)
-        ]]
-        self.inputs[self.data_desc[0][0]].set_lod(input_lod)
+        input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])]
+        self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths(
+            input_lod)
         # preserve the output of LodTensor with implicit lod to compare
         expect_output = [
             numpy.array(actual_output) for actual_output in self.actual_outputs
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index 3d754aff3a73e7168e2123483b26e5e3a3585a4e..df5684ab173a4889dd7b693f9246bafd12e0345f 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -107,7 +107,7 @@ class TestROIPoolOp(OpTest):
         rois = []
         self.rois_lod = [[]]
         for bno in range(self.batch_size):
-            self.rois_lod[0].append(len(rois))
+            self.rois_lod[0].append(bno + 1)
             for i in range(bno + 1):
                 x1 = np.random.random_integers(
                     0, self.width / self.spatial_scale - self.pooled_width)
@@ -121,7 +121,6 @@ class TestROIPoolOp(OpTest):
 
                 roi = [bno, x1, y1, x2, y2]
                 rois.append(roi)
-        self.rois_lod[0].append(len(rois))
         self.rois_num = len(rois)
         self.rois = np.array(rois).astype("int64")
 
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
index 30f1efbcbcb11332c85c9d5489f22c17b06c2b36..07dcd108689ae6069e30fe22029258d192215549 100644
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
@@ -19,8 +19,10 @@ from op_test import OpTest
 
 def row_conv_forward(x, lod, wt):
     out = np.zeros_like(x)
-    seq_info = lod[0]
-    num_sequences = len(seq_info) - 1
+    num_sequences = len(lod[0])
+    seq_info = [0]
+    for seq_len in lod[0]:
+        seq_info.append(seq_info[-1] + seq_len)
     context_length = wt.shape[0]
 
     for i in range(num_sequences):  # loop over number of sequences
@@ -32,7 +34,6 @@ def row_conv_forward(x, lod, wt):
         cur_timesteps = end - start
         for j in range(cur_timesteps):  # loop over different timesteps
             for k in range(context_length):
-
                 if j + k >= cur_timesteps:
                     continue
                 curoutput[j, :] += curinput[j + k, :] * wt[k, :]
@@ -44,8 +45,8 @@ class TestRowConvOp1(OpTest):
     def setUp(self):
 
         self.op_type = "row_conv"
-        lod = [[0, 2, 5, 7]]
-        T = lod[0][-1]
+        lod = [[2, 3, 2]]
+        T = sum(lod[0])
         D = 16
         context_length = 2
 
@@ -75,8 +76,8 @@ class TestRowConvOp2(OpTest):
     def setUp(self):
 
         self.op_type = "row_conv"
-        lod = [[0, 20, 50, 100]]
-        T = lod[0][-1]
+        lod = [[20, 30, 50]]
+        T = sum(lod[0])
         D = 35
         context_length = 35
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
index 10592d127fafdf202c65fcfa91b5c464cc60e96c..11ffa761a690eb1f9f6dc50c45128a99301741db 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
@@ -18,14 +18,19 @@ import sys
 from op_test import OpTest
 
 
-def to_abs_lod(lod):
-    if len(lod) == 0 or len(lod) == 1:
-        return lod
+def to_abs_offset_lod(lod):
+    offset_lod = [[0] for i in lod]
+    for i, level in enumerate(lod):
+        for seq_len in level:
+            offset_lod[i].append(offset_lod[i][-1] + seq_len)
+
+    if len(offset_lod) == 0 or len(offset_lod) == 1:
+        return offset_lod
     import copy
-    new_lod = copy.deepcopy(lod)
-    for idx, val in enumerate(lod[0]):
-        new_lod[0][idx] = lod[1][val]
-    return new_lod
+    new_offset_lod = copy.deepcopy(offset_lod)
+    for idx, val in enumerate(offset_lod[0]):
+        new_offset_lod[0][idx] = offset_lod[1][val]
+    return new_offset_lod
 
 
 def seq_concat(inputs, level):
@@ -35,11 +40,11 @@ def seq_concat(inputs, level):
     x1 = inputs['X'][1][1][0]
     level_idx = len(lod0) - level - 1
     outs = []
-    for i in range(len(lod0[level_idx]) - 1):
-        sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][
-            i + 1], :]
-        sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][
-            i + 1], :]
+    for i in range(len(lod0[level_idx])):
+        sub_x0 = x0[to_abs_offset_lod(lod0)[level_idx][i]:to_abs_offset_lod(
+            lod0)[level_idx][i + 1], :]
+        sub_x1 = x1[to_abs_offset_lod(lod1)[level_idx][i]:to_abs_offset_lod(
+            lod1)[level_idx][i + 1], :]
         outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
     return np.concatenate(outs, axis=0)
 
@@ -48,9 +53,9 @@ class TestSeqConcatOp(OpTest):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((4, 8, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod1 = [[2, 2], [1, 1, 1, 1]]
         axis = 1
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
@@ -72,14 +77,14 @@ class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((7, 6, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]]
+        lod1 = [[2, 2], [1, 2, 2, 2]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        out_lod = [[2, 2], [2, 3, 3, 3]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
@@ -87,14 +92,14 @@ class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((7, 6, 3)).astype('float32')
-        lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        lod1 = [[3, 1], [1, 2, 2, 2]]
         axis = 0
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        out_lod = [[5, 3], [1, 1, 1, 2, 2, 1, 1, 2]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
@@ -102,14 +107,14 @@ class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 3, 4)).astype('float32')
-        lod0 = [[0, 1, 2, 3, 4]]
+        lod0 = [[1, 1, 1, 1]]
         x1 = np.random.random((7, 3, 4)).astype('float32')
-        lod1 = [[0, 1, 3, 5, 7]]
+        lod1 = [[1, 2, 2, 2]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 2, 5, 8, 11]]
+        out_lod = [[2, 3, 3, 3]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index 51dbf1f61834ff0093d76ed546be27a585697d40..9701d9adef1fd272f2520f66607acded6a8c25c6 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -75,35 +75,38 @@ class TestSeqProject(OpTest):
         pading_data = self.pad_data
         out = np.zeros((self.input_size[0], self.context_length *
                         self.input_size[1])).astype('float32')
-        lod = lod[0]
+        offset = [0]
+        for seq_len in lod[0]:
+            offset.append(offset[-1] + seq_len)
         begin_pad = np.max([0, -self.context_start])
 
-        for i in range(len(lod) - 1):
+        for i in range(len(offset) - 1):
             for j in range(self.context_length):
-                in_begin = lod[i] + self.context_start + j
-                in_end = lod[i + 1] + self.context_start + j
-                out_begin = lod[i]
-                out_end = lod[i + 1]
-                if in_begin < lod[i]:
-                    pad_size = np.min([lod[i] - in_begin, lod[i + 1] - lod[i]])
+                in_begin = offset[i] + self.context_start + j
+                in_end = offset[i + 1] + self.context_start + j
+                out_begin = offset[i]
+                out_end = offset[i + 1]
+                if in_begin < offset[i]:
+                    pad_size = np.min(
+                        [offset[i] - in_begin, offset[i + 1] - offset[i]])
                     if self.padding_trainable:
                         sub_w = pading_data[j:j + pad_size, :]
-                        out[lod[i]:lod[i] + pad_size, j * self.input_size[1]:(
-                            j + 1) * self.input_size[1]] = sub_w
-                    out_begin = lod[i] + pad_size
-                    in_begin = lod[i]
+                        out[offset[i]:offset[i] + pad_size, j * self.input_size[
+                            1]:(j + 1) * self.input_size[1]] = sub_w
+                    out_begin = offset[i] + pad_size
+                    in_begin = offset[i]
 
-                if in_end > lod[i + 1]:
+                if in_end > offset[i + 1]:
                     pad_size = np.min(
-                        [in_end - lod[i + 1], lod[i + 1] - lod[i]])
+                        [in_end - offset[i + 1], offset[i + 1] - offset[i]])
                     if self.padding_trainable:
                         sub_w = pading_data[begin_pad + self.context_start + j -
                                             pad_size:begin_pad +
                                             self.context_start + j, :]
-                        out[lod[i + 1] - pad_size:lod[i + 1], j * self.
+                        out[offset[i + 1] - pad_size:offset[i + 1], j * self.
                             input_size[1]:(j + 1) * self.input_size[1]] = sub_w
-                    in_end = lod[i + 1]
-                    out_end = lod[i + 1] - pad_size
+                    in_end = offset[i + 1]
+                    out_end = offset[i + 1] - pad_size
                 if in_end <= in_begin:
                     continue
 
@@ -175,7 +178,11 @@ class TestSeqProject(OpTest):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        self.lod = [[0, 4, 5, 8, self.input_row]]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
@@ -188,7 +195,11 @@ class TestSeqProjectCase1(TestSeqProject):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        self.lod = [[0, 4, 5, 8, self.input_row]]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
@@ -203,8 +214,12 @@ class TestSeqProjectCase2(TestSeqProject):
         self.input_size = [self.input_row, 23]
         idx = range(self.input_size[0])
         del idx[0]
-        self.lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
-                    [self.input_size[0]]]
+        offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
+                      [self.input_size[0]]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 2e48ef0e880839f6d5b4e515a174f427a35e7e6f..0b3659d7a67956f7546d368346bd102eeedf1d97 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -18,26 +18,34 @@ from op_test import OpTest
 
 
 class TestSeqAvgPool(OpTest):
+    def convert_to_offset(self, lod):
+        offset = [[0] for i in lod]
+        for i, level in enumerate(lod):
+            for seq_len in level:
+                offset[i].append(offset[i][-1] + seq_len)
+        return offset
+
     def set_data(self):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 11]]
+        lod = [[4, 1, 3, 3]]
         self.inputs = {'X': (x, lod)}
+        offset = self.convert_to_offset(lod)
 
         out = np.zeros((4, 23)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x.mean(axis=0)
 
     def setUp(self):
-        x, lod, out = self.set_data()
-        self.compute(x, lod, out)
+        x, offset, out = self.set_data()
+        self.compute(x, offset, out)
 
     def test_check_output(self):
         self.check_output()
@@ -50,10 +58,10 @@ class TestSeqAvgPool(OpTest):
 
 
 class TestSeqSumPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x.sum(axis=0)
 
 
@@ -61,46 +69,47 @@ class TestSeqMaxPool(TestSeqAvgPool):
     def set_data(self):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 2.0
+        lod = [[4, 1, 3, 5]]
+        offset = self.convert_to_offset(lod)
+        for i in range(len(offset[0]) - 1):
+            l = offset[0][i + 1] - offset[0][i]
+            x[offset[0][i] + np.random.randint(l), :] += 2.0
 
         self.inputs = {'X': (x, lod)}
 
         out = np.zeros((4, 23)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = np.amax(sub_x, axis=0)
 
 
 class TestSeqSqrtPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(len)
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            seq_len = offset[0][i + 1] - offset[0][i]
+            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
 
 
 class TestSeqLastPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x[-1, :]
 
 
 class TestSeqFirstPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x[0, :]
 
 
@@ -109,35 +118,39 @@ class TestSeqAvgPool2D(TestSeqAvgPool):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
+        lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
+        offset = self.convert_to_offset(lod)
 
         out = np.zeros((4, 3, 17)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
 
 
 class TestSeqSumPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
 
 
 class TestSeqSqrtPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
+            seq_len = offset[0][i + 1] - offset[0][i]
+            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17))
 
     def test_check_grad(self):
         # Remove MaxIndex after check_grad is refined.
@@ -150,36 +163,40 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
     def set_data(self):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
+        lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 1.0
+        offset = self.convert_to_offset(lod)
+        for i in range(len(offset[0]) - 1):
+            l = offset[0][i + 1] - offset[0][i]
+            x[offset[0][i] + np.random.randint(l), :] += 1.0
 
         out = np.zeros((4, 3, 11)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
             out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
 class TestSeqLastPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x[-1, :], (3, 17))
 
 
 class TestSeqFirstPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x[0, :], (3, 17))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
index ebab77e8041d5ff1bd845fb121e5901116fd0254..8f0765277ae85af2b17ad96d4fd0c1148c393ff0 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
@@ -18,15 +18,17 @@ from op_test import OpTest
 
 
 def sequence_erase(in_seq, lod0, tokens):
-    new_lod0 = [0]
+    new_lod0 = []
     out_seq = []
-    for i in range(0, len(lod0) - 1):
+    offset = 0
+    for i in range(0, len(lod0)):
         num_out = 0
-        for dat in in_seq[lod0[i]:lod0[i + 1]]:
+        for dat in in_seq[offset:(offset + lod0[i])]:
             if dat not in tokens:
                 out_seq.append(dat)
                 num_out += 1
-        new_lod0.append(new_lod0[-1] + num_out)
+        offset += lod0[i]
+        new_lod0.append(num_out)
     return np.array(out_seq).astype("int32"), new_lod0
 
 
@@ -34,7 +36,7 @@ class TestSequenceEraseOpInt32(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = [2, 3, 5]
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
@@ -49,7 +51,7 @@ class TestSequenceEraseOpInt64(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = [2, 3, 5]
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
@@ -64,7 +66,7 @@ class TestSequenceEraseOpEmpty(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = []
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 4c8ec1426c6e103498af544ea5928ec630707d46..0bbd31814efdff6050733f6876ef64e3fcaaaf76 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -21,7 +21,7 @@ class TestSequenceExpand(OpTest):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
         y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[0, 1, 4, 8]]
+        y_lod = [[1, 3, 4]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
 
     def compute(self):
@@ -37,23 +37,27 @@ class TestSequenceExpand(OpTest):
         out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype)
 
         if x_lod is None:
-            x_idx = [i for i in xrange(x_data.shape[0] + 1)]
+            # x_idx = [i for i in xrange(x_data.shape[0] + 1)]
+            x_idx = [1] * x_data.shape[0]
         else:
             x_idx = x_lod[0]
-            out_lod = [[0]]
+            out_lod = [[]]
+
+        offset = 0
+        for i in xrange(len(y_lod[ref_level])):
+            repeat_num = y_lod[ref_level][i]
+            x_len = x_idx[i]
 
-        for i in xrange(1, len(y_lod[ref_level])):
-            repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1]
-            x_len = x_idx[i] - x_idx[i - 1]
             if repeat_num > 0:
-                x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
+                x_sub = x_data[offset:(offset + x_len), :]
                 stacked_x_sub = x_sub
                 for r in range(repeat_num - 1):
                     stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
                 out = np.vstack((out, stacked_x_sub))
                 if x_lod is not None:
                     for j in xrange(repeat_num):
-                        out_lod[0].append(out_lod[0][-1] + x_len)
+                        out_lod[0].append(x_len)
+            offset += x_len
 
         if x_lod is None:
             self.outputs = {'Out': out}
@@ -75,9 +79,9 @@ class TestSequenceExpand(OpTest):
 class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[0, 2, 5]]
+        x_lod = [[2, 3]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
-        y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
+        y_lod = [[2, 3], [2, 2, 3, 3, 3]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
         self.attrs = {'ref_level': 0}
 
@@ -85,9 +89,9 @@ class TestSequenceExpandCase1(TestSequenceExpand):
 class TestSequenceExpandCase2(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
-        x_lod = [[0, 1]]
+        x_lod = [[1]]
         y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[0, 2], [0, 2]]
+        y_lod = [[2], [1, 1]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
         self.attrs = {'ref_level': 0}
 
@@ -95,9 +99,9 @@ class TestSequenceExpandCase2(TestSequenceExpand):
 class TestSequenceExpandCase3(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
-        x_lod = [[0, 1, 2, 3, 4]]
-        y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
-        y_lod = [[0, 2, 4, 4, 6]]
+        x_lod = [[1, 1, 1, 1]]
+        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        y_lod = [[2, 2, 2, 2]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
@@ -105,9 +109,9 @@ class TestSequenceExpandCase4(TestSequenceExpand):
     def set_data(self):
         data = np.random.uniform(0.1, 1, [5 * 2, 1])
         x_data = np.array(data).reshape([5, 2]).astype('float32')
-        x_lod = [[0, 2, 5]]
-        y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
-        y_lod = [[0, 1, 3], [0, 1, 3]]
+        x_lod = [[2, 3]]
+        y_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
+        y_lod = [[2], [2, 3]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
index efeab560392d8c03b1bb5db83f59c12d4fef64b0..68f2e5eba35ed318281d14e397dc6d363bcb4079 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
@@ -22,7 +22,7 @@ class TestSequenceReshape(OpTest):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
+        x_lod = [[4, 1, 3, 3]]
         x = np.random.uniform(0.1, 1, [11, 24]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
@@ -34,13 +34,13 @@ class TestSequenceReshape(OpTest):
 
     def compute_output(self, x, x_lod, dimension):
         x_width = x.shape[1]
-        out_lod = [[0]]
-        for i in xrange(len(x_lod[0]) - 1):
-            seq_len = x_lod[0][i + 1] - x_lod[0][i]
+        out_lod = [[]]
+        for i in xrange(len(x_lod[0])):
+            seq_len = x_lod[0][i]
             offset = (seq_len * x_width) / dimension
             assert int(offset) * dimension == seq_len * x_width
-            out_lod[0].append(out_lod[0][-1] + int(offset))
-        out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32')
+            out_lod[0].append(int(offset))
+        out = np.zeros(shape=(sum(out_lod[0]), dimension)).astype('float32')
         out.ravel()[:] = x.ravel()[:]
         return out, out_lod
 
@@ -55,7 +55,7 @@ class TestSequenceReshape_reduce(TestSequenceReshape):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 24
-        x_lod = [[0, 4, 6, 8, 12]]
+        x_lod = [[4, 2, 2, 4]]
         x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
@@ -70,7 +70,7 @@ class TestSequenceReshape_same(TestSequenceReshape):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 12
-        x_lod = [[0, 4, 6, 8, 12]]
+        x_lod = [[4, 2, 2, 4]]
         x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
index 660b4a171d09ddfc0e78b650a467db6b576c7ee3..313e485d1e3080f2c59c68256cbc5c81aa6558cd 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
@@ -29,20 +29,20 @@ class TestSequenceSliceOp(OpTest):
 
         self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
         outs = []  #np.zeros((100, 3, 2)).astype('float32')
-        out_lod = [[0]]
-        out_lod_offset = 0
+        out_lod = [[]]
+        lod_offset = 0
         for i in range(len(offset)):
-            sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] +
+            sub_x = x[lod_offset + offset[i, 0]:lod_offset + offset[i, 0] +
                       length[i, 0], :]
-            out_lod_offset = out_lod_offset + len(sub_x)
             outs.append(sub_x)
-            out_lod[0].append(out_lod_offset)
+            out_lod[0].append(len(sub_x))
+            lod_offset += lod[0][i]
         outs = np.concatenate(outs, axis=0)
         self.outputs = {'Out': (outs, out_lod)}
 
     def init_test_case(self):
         self.x_dim = (100, 3, 2)
-        self.x_lod = [[0, 20, 40, 60, 80, 100]]
+        self.x_lod = [[20, 20, 20, 20, 20]]
         self.offset = [[1], [2], [3], [4], [5]]
         self.length = [[10], [8], [6], [4], [2]]
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
index d6dc99bb3106feee33daa52bffb386f07cc16de5..e91a69a0f8039651225039beb2a42e8dffeb62d3 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -26,15 +26,16 @@ class TestSequenceSoftmaxOp(OpTest):
         self.init_op_type()
 
         x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
-        lod = [[0, 4, 5, 8, 11]]
+        lod = [[4, 1, 3, 3]]
 
         out = np.zeros((11, 1)).astype("float32")
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i])
+        offset = 0
+        for i in range(len(lod[0])):
+            sub_x = x[offset:offset + lod[0][i], :]
+            sub_x = sub_x.reshape(1, lod[0][i])
             sub_out = stable_softmax(sub_x)
-            out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape(
-                lod[0][i + 1] - lod[0][i], 1)
+            out[offset:offset + lod[0][i], :] = sub_out.reshape(lod[0][i], 1)
+            offset += lod[0][i]
 
         self.inputs = {"X": (x, lod)}
         self.outputs = {"Out": out}
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index 1d93230e7b74c5b6c00bbe125e3ae2d3a649b4b9..b779f0fb014bbba62927754ea6f36828a32e6c0a 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -54,12 +54,12 @@ class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
     def test_refer_lod(self):
         cpu = core.CPUPlace()
         x_tensor = core.LoDTensor()
-        x_tensor.set_lod([[0, 2, 5, 6]])
+        x_tensor.set_recursive_sequence_lengths([[2, 3, 1]])
         tensor_np = np.random.random(size=(6, 100)).astype('float32')
         x_tensor.set(tensor_np, cpu)
 
         rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
         rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
                               cpu)
 
@@ -83,7 +83,7 @@ class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
         x_tensor.set(tensor_np, cpu)
 
         rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
         rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
                               cpu)
 
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index 02cc7da84918041c33bf5c8def46025bc87a2b9e..0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -56,7 +56,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
     def test_split_and_merge_lod_tensor_level_0(self):
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         mask_np = np.array([0, 1, 0]).astype('bool')
         mask_np = np.expand_dims(mask_np, axis=1)
@@ -68,15 +68,15 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
         expect_true = core.LoDTensor()
         expect_true.set(expect_true_tensor, self.place())
-        expect_true.set_lod([[0, 6]])
+        expect_true.set_recursive_sequence_lengths([[6]])
 
         expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
         expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
-        expect_false_lod = [[0, 3, 4]]
+        expect_false_lod = [[3, 1]]
 
         expect_false = core.LoDTensor()
         expect_false.set(expect_false_tensor, self.place())
-        expect_false.set_lod(expect_false_lod)
+        expect_false.set_recursive_sequence_lengths(expect_false_lod)
 
         self.main(
             tensor=tensor,
@@ -126,7 +126,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
 
     def check_tensor_same(self, actual, expect):
         self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
-        self.assertEqual(actual.lod(), expect.lod())
+        self.assertEqual(actual.recursive_sequence_lengths(),
+                         expect.recursive_sequence_lengths())
 
 
 class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
@@ -151,7 +152,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         mask_np = np.array([0, 1, 0]).astype('bool')
         mask_np = np.expand_dims(mask_np, axis=1)
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
index ccb41e56c5555b8c79674449c9139ada0bc47aac..bd208897520122b6a5dcf71da325b1b9dba632f6 100644
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
@@ -22,22 +22,23 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
     if len(gt_lod) != len(neg_lod):
         raise AssertionError("The input arguments are illegal.")
 
-    batch_size = len(gt_lod) - 1
+    batch_size = len(gt_lod)
 
     match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
-    neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
+    neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32')
 
+    offset = 0
     for n in range(batch_size):
-        gt_num = gt_lod[n + 1] - gt_lod[n]
+        gt_num = gt_lod[n]
         ids = random.sample([i for i in range(num_prior)], gt_num)
         match_indices[n, ids] = [i for i in range(gt_num)]
 
         ret_ids = set([i for i in range(num_prior)]) - set(ids)
-        s = neg_lod[n]
-        e = neg_lod[n + 1]
-        l = e - s
+        l = neg_lod[n]
         neg_ids = random.sample(ret_ids, l)
-        neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
+        neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype(
+            'int32').reshape(l, 1)
+        offset += neg_lod[n]
 
     return match_indices, neg_indices
 
@@ -56,24 +57,28 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
     # init weight for target label
     trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
 
+    gt_offset = 0
+    neg_offset = 0
     for i in range(batch_size):
         cur_indices = match_indices[i]
         col_ids = np.where(cur_indices > -1)
         col_val = cur_indices[col_ids]
 
-        gt_start = gt_lod[i]
         # target bbox
-        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
+        for v, c in zip(col_val + gt_offset, col_ids[0].tolist()):
             trg_box[i][c][:] = encoded_box[v][c][:]
         # weight for target bbox
         trg_box_wt[i][col_ids] = 1.0
 
-        trg_label[i][col_ids] = gt_label[col_val + gt_start]
+        trg_label[i][col_ids] = gt_label[col_val + gt_offset]
         trg_label_wt[i][col_ids] = 1.0
         # set target label weight to 1.0 for the negative samples
         if neg_indices is not None:
-            neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+            neg_ids = neg_indices[neg_offset:neg_offset + neg_lod[i]]
             trg_label_wt[i][neg_ids] = 1.0
+        # update offset
+        gt_offset += gt_lod[i]
+        neg_offset += neg_lod[i]
 
     return trg_box, trg_box_wt, trg_label, trg_label_wt
 
@@ -83,11 +88,11 @@ class TestTargetAssginFloatType(OpTest):
         self.op_type = "target_assign"
         num_prior = 120
         num_class = 21
-        gt_lod = [0, 5, 11, 23]
-        neg_lod = [0, 4, 7, 13]
+        gt_lod = [5, 6, 12]
+        neg_lod = [4, 3, 6]
         mismatch_value = 0
-        batch_size = len(gt_lod) - 1
-        num_gt = gt_lod[-1]
+        batch_size = len(gt_lod)
+        num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
         gt_label = np.random.randint(
@@ -121,11 +126,11 @@ class TestTargetAssginIntType(OpTest):
         self.op_type = "target_assign"
         num_prior = 120
         num_class = 21
-        gt_lod = [0, 5, 11, 23]
-        neg_lod = [0, 4, 7, 13]
+        gt_lod = [5, 6, 12]
+        neg_lod = [4, 3, 6]
         mismatch_value = 0
-        batch_size = len(gt_lod) - 1
-        num_gt = gt_lod[-1]
+        batch_size = len(gt_lod)
+        num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
         gt_label = np.random.randint(
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index 379081c3287ce81dbf2bd7307cb5eac2620b13db..f17edd3025b17549892bbd47935a1d2452cefac3 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -69,15 +69,14 @@ class TestTensor(unittest.TestCase):
         array[0, 0, 0] = 3
         array[3, 3, 5] = 10
         lod_tensor.set(array, place)
-        lod_tensor.set_lod([[0, 2, 4]])
+        lod_tensor.set_recursive_sequence_lengths([[2, 2]])
 
         lod_v = numpy.array(lod_tensor)
         self.assertTrue(numpy.alltrue(array == lod_v))
 
-        lod = lod_tensor.lod()
-        self.assertEqual(0, lod[0][0])
+        lod = lod_tensor.recursive_sequence_lengths()
+        self.assertEqual(2, lod[0][0])
         self.assertEqual(2, lod[0][1])
-        self.assertEqual(4, lod[0][2])
 
     def test_float_lod_tensor(self):
         place = core.CPUPlace()
@@ -97,21 +96,21 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertEqual(len(lod_tensor.lod()), 0)
+        self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0)
 
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor.set_lod(lod_py)
-        lod = lod_tensor.lod()
+        lod_py = [[2, 1], [1, 2, 2]]
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
+        lod = lod_tensor.recursive_sequence_lengths()
         self.assertListEqual(lod_py, lod)
 
     def test_lod_tensor_init(self):
         scope = core.Scope()
         place = core.CPUPlace()
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.set_lod(lod_py)
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
@@ -121,17 +120,17 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.lod())
+        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
 
     def test_lod_tensor_gpu_init(self):
         if not core.is_compiled_with_cuda():
             return
         place = core.CUDAPlace(0)
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.set_lod(lod_py)
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
@@ -141,7 +140,7 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.lod())
+        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
 
     def test_empty_tensor(self):
         place = core.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index ac638f7836f8205f80e31cfd5eb8892b2c7aee08..9f1aaee472f918da7deb8816a0a4654dafe74a30 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -34,8 +34,8 @@ class CTCForward(object):
 
         self.level = 0
         self.num_classes = softmax.shape[1]
-        self.batch_size = len(softmax_lod[self.level]) - 1
-        assert self.batch_size == len(labels_lod[self.level]) - 1
+        self.batch_size = len(softmax_lod[self.level])
+        assert self.batch_size == len(labels_lod[self.level])
 
         self.loss = np.zeros([self.batch_size, 1], dtype="float32")
         self.gradient = np.zeros(self.softmax.shape, dtype="float32")
@@ -156,16 +156,20 @@ class CTCForward(object):
         return -log_prob
 
     def forward(self):
+        softmax_offset = 0
+        labels_offset = 0
         for i in range(self.batch_size):
-            softmax_start_i = self.softmax_lod[self.level][i]
-            softmax_end_i = self.softmax_lod[self.level][i + 1]
-            labels_start_i = self.labels_lod[self.level][i]
-            labels_end_i = self.labels_lod[self.level][i + 1]
+            softmax_start_i = softmax_offset
+            softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
+            labels_start_i = labels_offset
+            labels_end_i = labels_offset + self.labels_lod[self.level][i]
 
             softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
             labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
             self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
                                                    labels_a_sequence)
+            softmax_offset += self.softmax_lod[self.level][i]
+            labels_offset += self.labels_lod[self.level][i]
         return self.loss
 
 
@@ -173,8 +177,8 @@ class TestWarpCTCOp(OpTest):
     def config(self):
         self.batch_size = 4
         self.num_classes = 8
-        self.logits_lod = [[0, 4, 5, 8, 11]]
-        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.logits_lod = [[4, 1, 3, 3]]
+        self.labels_lod = [[3, 1, 4, 4]]
         self.blank = self.num_classes - 1
         self.norm_by_times = False
 
@@ -184,11 +188,13 @@ class TestWarpCTCOp(OpTest):
 
         logits = np.random.uniform(
             0.1, 1.0,
-            [self.logits_lod[0][-1], self.num_classes]).astype("float32")
+            [sum(self.logits_lod[0]), self.num_classes]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         # labels should not be blank
         labels = np.random.randint(
-            0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32")
+            0,
+            self.num_classes - 1, [sum(self.labels_lod[0]), 1],
+            dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
                          self.blank, self.norm_by_times)
@@ -196,9 +202,8 @@ class TestWarpCTCOp(OpTest):
 
         max_sequence_length = 0
         for i in range(self.batch_size):
-            max_sequence_length = max(
-                max_sequence_length,
-                self.logits_lod[0][i + 1] - self.logits_lod[0][i])
+            max_sequence_length = max(max_sequence_length,
+                                      self.logits_lod[0][i])
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
             dtype="float32")
@@ -222,8 +227,8 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
     def config(self):
         self.batch_size = 4
         self.num_classes = CUDA_BLOCK_SIZE + 2
-        self.logits_lod = [[0, 4, 5, 8, 11]]
-        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.logits_lod = [[4, 1, 3, 3]]
+        self.labels_lod = [[3, 1, 4, 4]]
         self.blank = 0
         self.norm_by_times = False
 
diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
index 2adf917bc5d3bb35842a817c57a983627b759f22..436f9b9f86fb86270e47c8e30c5c0701787ca0f1 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
@@ -76,11 +76,11 @@ class TestWeightNormalization(unittest.TestCase):
                 lod_level_i = numpy.random.randint(
                     low=1,
                     high=5,
-                    size=self.batch_size if i == 0 else lod_level_i[-1])
-                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                    size=self.batch_size
+                    if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[data_lod[-1][-1] if data_lod else self.batch_size
+                size=[sum(data_lod[-1]) if data_lod else self.batch_size
                       ] + data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
@@ -90,7 +90,7 @@ class TestWeightNormalization(unittest.TestCase):
             tensor = fluid.Tensor()
             tensor.set(self.data[desc[0]][0], place)
             if self.data[desc[0]][1]:
-                tensor.set_lod(self.data[desc[0]][1])
+                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
             self.inputs[desc[0]] = tensor
 
     def weight_normalize(self):
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index 1dc94a80c9d3999d34fdf0edbf82ffe297bd95d7..a995ee10f29a714b674fae4b31070e6ba2ca9953 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -22,7 +22,7 @@ def as_lodtensor(np_array, lod, place):
     tensor = core.LoDTensor()
     tensor.set(np_value, place)
     if lod is not None:
-        tensor.set_lod(lod)
+        tensor.set_recursive_sequence_lengths(lod)
     return tensor
 
 
@@ -73,7 +73,7 @@ def set_input(scope, op, inputs, place):
         if isinstance(var, tuple) or isinstance(var, np.ndarray):
             tensor = scope.find_var(var_name).get_tensor()
             if isinstance(var, tuple):
-                tensor.set_lod(var[1])
+                tensor.set_recursive_sequence_lengths(var[1])
                 var = var[0]
             tensor.set_dims(var.shape)
             tensor.set(var, place)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 9c604170b8b53c9cbcf39b4978ae60ccad84648c..99146bcfe57ee5192f7586ac1a4a8fbc5b94035d 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -24,7 +24,7 @@ Steps to transpile trainer:
 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
 2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
 3. modify trainer program add split_op to each grad variable.
-4. append send_op to send splited variables to server and 
+4. append send_op to send splited variables to server and
 5. add recv_op to fetch params(splited blocks or origin param) from server.
 6. append concat_op to merge splited blocks to update local weights.
 
@@ -44,7 +44,7 @@ import numpy as np
 from ps_dispatcher import RoundRobin, HashName, PSDispatcher
 from .. import core, framework
 from ..framework import Program, default_main_program, \
-                        default_startup_program, \
+                        default_startup_program, Block, \
                         Variable, Parameter, grad_var_name
 from details import *
 
@@ -471,7 +471,7 @@ class DistributeTranspiler:
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
                                          self.origin_program, merged_var)
             else:
-                self._append_pserver_non_opt_ops(block, op, endpoint)
+                self._append_pserver_non_opt_ops(block, op)
 
         def __op_have_grad_input__(op):
             for varname in op.input_arg_names:
@@ -479,13 +479,39 @@ class DistributeTranspiler:
                     return varname
             return ""
 
+        def __clone_lr_op_sub_block__(op, program, new_block):
+            if not op.has_attr('sub_block'):
+                return
+
+            origin_block_desc = op.attr('sub_block')
+            origin_block = self.origin_program.block(origin_block_desc.id)
+            assert isinstance(origin_block, Block)
+            # we put the new sub block to new block to follow the block
+            # hierarchy of the original blocks
+            new_sub_block = program.create_block(new_block.idx)
+
+            # clone vars
+            for var in origin_block.vars:
+                new_sub_block.clone_variable(var)
+
+            # clone ops
+            for op in origin_block.ops:
+                self._clone_lr_op(program, new_sub_block, op)
+                # clone sub_block of op
+                __clone_lr_op_sub_block__(op, program, new_sub_block)
+
+            # reset the block of op
+            op.set_attr('sub_block', new_sub_block)
+
         # append lr decay ops to the child block if exists
         lr_ops = self._get_lr_ops()
         if len(lr_ops) > 0:
             lr_decay_block = pserver_program.create_block(
                 pserver_program.num_blocks - 1)
             for _, op in enumerate(lr_ops):
-                self._append_pserver_non_opt_ops(lr_decay_block, op, endpoint)
+                self._append_pserver_non_opt_ops(lr_decay_block, op)
+                # append sub blocks to pserver_program in lr_decay_op
+                __clone_lr_op_sub_block__(op, pserver_program, lr_decay_block)
 
         # append op to the current block
         grad_to_block_id = []
@@ -1116,7 +1142,29 @@ class DistributeTranspiler:
                     break
         return grad_block
 
-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op, endpoint):
+    def _clone_lr_op(self, program, block, op):
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in inputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in outputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        block.append_op(
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
+
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
         inputs = self._get_input_map_from_op(
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index 202aa76084432b4b2378470919b2e924301f2130..0629f2916b339a6cd19ccadf435a67a17d6da4cc 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -19,16 +19,30 @@ from ..executor import global_scope
 
 
 class InferenceTranspiler:
+    '''
+    Convert the fluid program to optimized inference program. 
+    
+    There are several optimizations, only fuse batch normalization is supported now.
+
+    Examples:
+   
+    .. code-block:: python
+
+        # As InferenceTranspiler will modify the original program,
+        # please clone before use it.
+        inference_transpiler_program = program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+    '''
+
     def transpile(self, program, place, scope=None):
         '''
-        Transpile the program. Support only fuse batch normalization now.
-
-        :param program: program to transpile 
-        :type program: Program
-        :param place: inference place 
-        :type place: Place
-        :param scope: inference scope 
-        :type scope: Scope or None
+        Run the transpiler.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope|None): inference Scope
         '''
         if not isinstance(program, Program):
             raise TypeError("program should be as Program type")
@@ -49,36 +63,43 @@ class InferenceTranspiler:
         can be integrated with them. Doing so will give us a forward acceleration, 
         especially in environments like mobile or embedded.
                     
-        For input X:
-        - Conv process:        X = input * W + bias 
-        - Batch norm process:  X' = (X - mean) / std 
-        - Scale Process:       Y = a * X' + b
+        For input :math:`X`:
+
+        - Conv process:        :math:`X = input * W + bias` 
+        - Batch norm process:  :math:`X' = (X - mean) / std` 
+        - Scale Process:       :math:`Y = a * X' + b`
 
         After fuse into one operation:
 
-        Y = (input * W + bias - mean) / std * a + b
-          = input * a * W / std + ((bias - mean) / std * a + b)
+        .. math::
+
+            Y &= (input * W + bias - mean) / std * a + b \\\\
+              &= input * a * W / std + ((bias - mean) / std * a + b)
 
         The operator transformation is: 
+
         - before:
+
           - conv->batch_norm->any_other_op (bias == 0)
           - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
+            
         - after: 
+
           - conv->elementwise_add->any_other_op
         
         The transpile stages are:
+
         1. insert elementwise_add op when bias == 0.
         2. fuse the batch_norm's parameters to conv and elementwise_add operators.
         3. remove batch_norm ops which are not used in any other ops.
         4. adjust the input of any_other_op to be the output of elementwise_add operator.
         5. remove unused variables.
 
-        :param program: program to transpile 
-        :type program: Program
-        :param place: inference place 
-        :type place: Place
-        :param scope: inference scope 
-        :type scope: Scope
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope): inference Scope
+        
         '''
         self.scope = scope
         self.place = place
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index 9ff0ae6fca27d4681891b2033e2f8f95bd825942..8bfb554845d9b128f000d6c90cf626416a198eef 100644
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -157,9 +157,11 @@ class ControlFlowGraph(object):
             if op.type() == "fill_constant" and op.attr("force_cpu") == True:
                 self._skip_opt.update(op.output_arg_names())
 
-    def release_memory(self):
+    def release_memory(self, skip_opt_set=None):
         self._dataflow_analyze()
         self._update_skip_opt_set()
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
         fwd_id = 0
         bwd_id = 0
         for i in range(self.op_size):
@@ -183,7 +185,7 @@ class ControlFlowGraph(object):
                 else:
                     bwd_id += 1
 
-    def memory_optimize(self, level=0):
+    def memory_optimize(self, skip_opt_set=None, level=0):
         def compare_shape(x_shape, cache_shape, opt_level):
             if opt_level == 0:
                 return x_shape == cache_shape
@@ -200,6 +202,9 @@ class ControlFlowGraph(object):
 
         self._dataflow_analyze()
         self._update_skip_opt_set()
+        # update skip set to meet users' demand
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
         self.pool = []
         for i in range(self.op_size):
             op = self._ops[i]
@@ -358,7 +363,7 @@ def _get_cfgs(input_program):
     return cfgs
 
 
-def memory_optimize(input_program, print_log=False, level=0):
+def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
     """Optimize memory by reusing var memory.
 
       Note: it doesn't not support subblock nested in subblock.
@@ -374,10 +379,10 @@ def memory_optimize(input_program, print_log=False, level=0):
     PRINT_LOG = print_log
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
-        cfg.memory_optimize(level)
+        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
 
 
-def release_memory(input_program):
+def release_memory(input_program, skip_opt_set=None):
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
-        cfg.release_memory()
+        cfg.release_memory(skip_opt_set=skip_opt_set)
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index b194af76dc529fd52b0aedfab9c41d625fe64c0d..a9775e10ef51fae493523149ee3dbbf227a1aaa9 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -7,7 +7,7 @@ for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
     if [[ $file =~ ^(paddle/api/.*|paddle/capi/.*|paddle/contrib/.*|paddle/cuda/.*|paddle/function/.*|paddle/gserver/.*|paddle/math/.*|paddle/optimizer/.*|paddle/parameter/.*|paddle/pserver/.*|paddle/trainer/.*|paddle/utils/.*) ]]; then
         continue;
     else
-        cpplint $file;
+        cpplint --filter=-readability/fn_size $file;
         TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
     fi
 done