Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into argmin_argmax

5c9dcc4d · sneaxiy · fa002cd6 · a29cb4be · 5c9dcc4d · 5c9dcc4d
178 changed file
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -22,6 +22,7 @@
 | jczaja | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
+| kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |

--- a/benchmark/fluid/models/machine_translation.py
+++ b/benchmark/fluid/models/machine_translation.py
@@ -173,21 +173,6 @@ def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
        return avg_cost, feeding_list


-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    lod_t = core.LoDTensor()
-    lod_t.set(flattened_data, place)
-    lod_t.set_lod([lod])
-    return lod_t, lod[-1]
-
-
 def lodtensor_to_ndarray(lod_tensor):
    dims = lod_tensor.get_dims()
    ndarray = np.zeros(shape=dims).astype('float32')

--- a/benchmark/fluid/models/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -125,18 +125,3 @@ def get_model(args):
        batch_size=args.batch_size)

    return loss, inference_program, adam, train_reader, test_reader, batch_acc
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = numpy.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -39,7 +39,7 @@ function(copy TARGET)
        message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
    endif()
    math(EXPR len "${copy_lib_SRCS_len} - 1")
-    
+
    add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
    foreach(index RANGE ${len})
        list(GET copy_lib_SRCS ${index} src)
@@ -155,6 +155,15 @@ copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )

+if(WITH_CONTRIB)
+   set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+   copy(contrib_inference_lib DEPS paddle_inference_api
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
+        DSTS ${contrib_dst_dir} ${contrib_dst_dir}
+   )
+endif()
+
 set(module "platform")
 copy(platform_lib DEPS profiler_py_proto
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h

--- a/doc/fluid/api/gen_doc.sh
+++ b/doc/fluid/api/gen_doc.sh
 #!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler > layers.rst
+python gen_doc.py layers --submodules control_flow device io nn ops tensor detection learning_rate_scheduler metric > layers.rst

 for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer
 do

--- a/doc/fluid/api/initializer.rst
+++ b/doc/fluid/api/initializer.rst
@@ -33,6 +33,13 @@ Xavier
    :members:
    :noindex:

+Bilinear
+--------
+
+..  autoclass:: paddle.fluid.initializer.Bilinear
+    :members:
+    :noindex:
+
 force_init_on_cpu
 -----------------

@@ -73,3 +80,10 @@ XavierInitializer
    :members:
    :noindex:

+BilinearInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.BilinearInitializer
+    :members:
+    :noindex:
+
--- a/doc/fluid/api/io.rst
+++ b/doc/fluid/api/io.rst
@@ -59,3 +59,39 @@ get_inference_program
 ..  autofunction:: paddle.fluid.io.get_inference_program
    :noindex:

+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
+load_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+    :noindex:
+
+save_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+    :noindex:
+
+get_latest_checkpoint_serial
+----------------------------
+
+..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+    :noindex:
+
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -181,6 +181,12 @@ Print
 ..  autofunction:: paddle.fluid.layers.Print
    :noindex:

+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
 device
 ======

@@ -219,6 +225,12 @@ Send
 ..  autofunction:: paddle.fluid.layers.Send
    :noindex:

+Recv
+----
+
+..  autofunction:: paddle.fluid.layers.Recv
+    :noindex:
+
 open_recordio_file
 ------------------

@@ -255,6 +267,25 @@ double_buffer
 ..  autofunction:: paddle.fluid.layers.double_buffer
    :noindex:

+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
+load
+----
+
+..  autofunction:: paddle.fluid.layers.load
+    :noindex:
+
 nn
 ==

@@ -342,6 +373,12 @@ conv2d
 ..  autofunction:: paddle.fluid.layers.conv2d
    :noindex:

+conv3d
+------
+
+..  autofunction:: paddle.fluid.layers.conv3d
+    :noindex:
+
 sequence_pool
 -------------

@@ -366,6 +403,12 @@ pool2d
 ..  autofunction:: paddle.fluid.layers.pool2d
    :noindex:

+pool3d
+------
+
+..  autofunction:: paddle.fluid.layers.pool3d
+    :noindex:
+
 batch_norm
 ----------

@@ -384,6 +427,12 @@ conv2d_transpose
 ..  autofunction:: paddle.fluid.layers.conv2d_transpose
    :noindex:

+conv3d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv3d_transpose
+    :noindex:
+
 sequence_expand
 ---------------

@@ -594,6 +643,48 @@ roi_pool
 ..  autofunction:: paddle.fluid.layers.roi_pool
    :noindex:

+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+image_resize
+------------
+
+..  autofunction:: paddle.fluid.layers.image_resize
+    :noindex:
+
+image_resize_short
+------------------
+
+..  autofunction:: paddle.fluid.layers.image_resize_short
+    :noindex:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
+mean_iou
+--------
+
+..  autofunction:: paddle.fluid.layers.mean_iou
+    :noindex:
+
 ops
 ===

@@ -699,12 +790,6 @@ logical_not
 ..  autofunction:: paddle.fluid.layers.logical_not
    :noindex:

-uniform_random
--------------
-
-..  autofunction:: paddle.fluid.layers.uniform_random
-    :noindex:
-
 uniform_random_batch_size_like
 ------------------------------

@@ -723,12 +808,6 @@ gaussian_random_batch_size_like
 ..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
    :noindex:

-cumsum
------
-
-..  autofunction:: paddle.fluid.layers.cumsum
-    :noindex:
-
 scatter
 -------

@@ -741,6 +820,30 @@ sum
 ..  autofunction:: paddle.fluid.layers.sum
    :noindex:

+slice
+-----
+
+..  autofunction:: paddle.fluid.layers.slice
+    :noindex:
+
+polygon_box_transform
+---------------------
+
+..  autofunction:: paddle.fluid.layers.polygon_box_transform
+    :noindex:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
+    :noindex:
+
+maxout
+------
+
+..  autofunction:: paddle.fluid.layers.maxout
+    :noindex:
+
 sigmoid
 -------

@@ -897,18 +1000,6 @@ stanh
 ..  autofunction:: paddle.fluid.layers.stanh
    :noindex:

-hard_shrink
-----------
-
-..  autofunction:: paddle.fluid.layers.hard_shrink
-    :noindex:
-
-thresholded_relu
----------------
-
-..  autofunction:: paddle.fluid.layers.thresholded_relu
-    :noindex:
-
 hard_sigmoid
 ------------

@@ -921,6 +1012,30 @@ swish
 ..  autofunction:: paddle.fluid.layers.swish
    :noindex:

+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+hard_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.hard_shrink
+    :noindex:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+thresholded_relu
+----------------
+
+..  autofunction:: paddle.fluid.layers.thresholded_relu
+    :noindex:
+
 tensor
 ======

@@ -978,6 +1093,18 @@ fill_constant
 ..  autofunction:: paddle.fluid.layers.fill_constant
    :noindex:

+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
 ones
 ----

@@ -993,6 +1120,12 @@ zeros
 detection
 =========

+prior_box
+---------
+
+..  autofunction:: paddle.fluid.layers.prior_box
+    :noindex:
+
 multi_box_head
 --------------

@@ -1080,3 +1213,18 @@ noam_decay
 ..  autofunction:: paddle.fluid.layers.noam_decay
    :noindex:

+metric
+======
+
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
+    :noindex:
+
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
+    :noindex:
+
--- a/doc/fluid/api/optimizer.rst
+++ b/doc/fluid/api/optimizer.rst
@@ -89,6 +89,13 @@ DecayedAdagradOptimizer
    :members:
    :noindex:

+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
 Adadelta
 --------


--- a/doc/fluid/api/profiler.rst
+++ b/doc/fluid/api/profiler.rst
@@ -23,3 +23,15 @@ profiler
 ..  autofunction:: paddle.fluid.profiler.profiler
    :noindex:

+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -104,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```

-关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。

 ## 提交（commit）


--- a/paddle/contrib/CMakeLists.txt
+++ b/paddle/contrib/CMakeLists.txt
@@ -14,4 +14,3 @@
 #

 add_subdirectory(inference)
-add_subdirectory(tape)
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -50,7 +50,7 @@ cc_test(test_paddle_inference_api
 inference_api_test(test_paddle_inference_api_impl
                    ARGS test_word2vec test_image_classification)

-if (WITH_ANAKIN)
+if (WITH_ANAKIN AND WITH_TESTING) # only needed in CI
    # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
    # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
    # compile the libinference_anakin_api.a and compile with anakin.so.

--- a/paddle/contrib/tape/CMakeLists.txt
+++ b/paddle/contrib/tape/CMakeLists.txt
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
-endif(APPLE)
-
-cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
-cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
-
-cc_test(test_tape
-        SRCS test_tape.cc
-        DEPS tape tape_variable)
--- a/paddle/contrib/tape/README.md
+++ b/paddle/contrib/tape/README.md
-# Dynamic Graph on Fluid
-
-PaddlePaddle Fluid is targeting the autodiff without tape, which, however, is very
-challenging and we are still way from there. DyNet and PyTorch provide a good design
-idea, the *tape*, that significantly eases the challenge.  Also, DyNet provides
-a C++ API that is as convenient as Python but with higher efficiency and could
-conveniently integrate with industrial/production systems. This package, `tape`,
-combines the good of
-
-1. tape from PyTorch and DyNet
-2. C++ API and core from DyNet
-3. rich set of operators from PaddlePaddle
-
-## Overview
-
-We can implement Dynet-like Tape(See this [survey](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/survey/dynamic_graph.md))
-by wrapping Paddle Fluid's `Operator` and `Variable`.
-
-The user API is straight forward since
-
-1. it is imperative. And it uses host language's control flow logic.
-1. it avoids extra concepts such as `Scope` and `Executor`.
-
-All of these benefits come at the cost of just adding one line `reset_global_tape`
-at every iteration.
-
-## Code Structure
-
-In short, the `Tape` contains a vector of `OpHandle`s. And an `OpHandle` contains its
-`type`, the pointers to the `Variable`s, and necessary attributes.
-
-```c++
-class Variable {
-public:
-  VriableHandle Grad(); // returns its gradient variable
-private:
-  framework::VarDesc desc_; // compile time infershape, necessary for lazy execution
-  framework::Variable var_; // run time variable, holds data memory
-};
-
-using VariableHandle = shared_ptr<Variable>;
-
-struct OpHandle {
-  string type_;
-  map<string, vector<VariableHandle>> inputs_;
-  map<string, vector<VariableHandle>> outputs_;
-  AttributeMap attrs_;
-};
-
-class Tape {
-public:
-  void AddOp(OpHandle); // add op
-  void Forward();       // execute the tape_
-  void Backward();      // execute the backward of the tape_
-private:
-  vector<OpHandle> tape_;
-};
-```
-
-We uses `Function` to indicate layers. It takes care of parameter
-initialization and `AddOp` to the Tape when it is called.
-
-```c++
-class Linear {
- public:
-  Linear(int in_dim, int out_dim, const std::string &act)
-      : w_(new Variable("LinearWeight")),
-        b_(new Variable("LinearBias")),
-        act_(act) {
-    Tape init_tape;
-
-    std::string initializer = "fill_constant";
-    framework::AttributeMap attrs;
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{in_dim, out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
-
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
-
-    init_tape.Forward();
-  }
-
-  VariableHandle operator()(VariableHandle input) {
-    VariableHandle pre_bias(new Variable("linear"));
-    get_global_tape().AddOp("mul",
-                            {{"X", {input}}, {"Y", {w_}}},
-                            {{"Out", {pre_bias}}},
-                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
-    VariableHandle pre_act(new Variable("linear"));
-    get_global_tape().AddOp("elementwise_add",
-                            {{"X", {pre_bias}}, {"Y", {b_}}},
-                            {{"Out", {pre_act}}},
-                            {{"axis", 1}});
-    VariableHandle post_act(new Variable("linear"));
-    get_global_tape().AddOp(act_,
-                            {{"X", {pre_act}}},
-                            {{"Out", {post_act}}},
-                            {});
-    return post_act;
-  }
-
-  std::vector<VariableHandle> Params() { return {w_, b_}; }
-
- private:
-  VariableHandle w_;
-  VariableHandle b_;
-  std::string act_;
-};
-```
-
-## User API
-
-```c++
-// Model function
-paddle::tape::Linear linear1(3, 3, "relu"); // init weight and bias
-paddle::tape::Linear linear2(3, 3, "relu"); // init weight and bias
-paddle::tape::Mean mean;
-
-// Optimizer
-paddle::tape::SGD sgd(0.001);
-
-// Data Feeder
-paddle::tape::Fill data_feeder(...);
-VariableHandle input(new paddle::tape::Variable("input"));
-VariableHandle label(new paddle::tape::Variable("label"));
-
-for (int i = 0; i < 2; ++i) {
-  reset_global_tape();
-
-  data_feeder(input, label);
-
-  auto loss = softmax(linear2(linear1(input)), label); // compile time InferShape & InferVarType
-  LOG(INFO) << loss.value(); // Run forward up to loss
-
-  // Run backward, store gradient of w at w->Grad()
-  get_global_tape.Backward(loss);
-
-  // Update w
-  sgd(linear1.Params());
-  sgd(linear2.Params());
-}
-```
-
-<details>
-  <summary></summary>
-digraph G {
-
-	subgraph cluster_0 {
-                node [shape=record,style=filled];
-		style=filled;
-		color=lightgrey;
-                linear1 [label="{type: mul | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1}} |  {output |<before_bias1> Out: before_bias1}}"];
-                elementwise_add1 [label="{type: elementwise_add | {input | {<before_bias1>X: before_bias1 |<bias1> Y: bias1}} |  {output |<before_act1> Out: before_act1}}"];
-                relu1 [label="{type: relu | {input | {<before_act1>X: before_act1 }} |  {output |<after_act1> Out: after_act1}}"];
-
-		linear1 -> elementwise_add1->relu1;
-		label = "forward tape";
-	}
-
-        linear1:before_mul1->before_mul1
-        linear1:weight1->weight1
-        linear1:before_bias1->before_bias1
-
-        elementwise_add1:bias1->bias1
-        elementwise_add1:before_bias1->before_bias1
-        elementwise_add1:before_act1->before_act1
-
-        relu1:before_act1->before_act1
-        relu1:after_act1->after_act1
-
-	subgraph cluster_1 {
-                node [shape=record,style=filled];
-		style=filled;
-		color=lightgrey;
-                linear1_grad [label="{type: mul_grad | {input | {<before_mul1>X: before_mul1 |<weight1> Y: weight1|<before_bias1_grad> Out_grad: before_bias1_grad}} |  {output |{<before_mul1_grad>X_grad: before_mul1_grad |<weight1_grad> Y_grad: weight1_grad}}}"];
-
-                elementwise_add1_grad [label="{type: elementwise_add_grad | {input | <before_act1_grad> Out_grad: before_act1_grad} |  {output |{<before_bias1_grad>X_grad: before_bias1_grad |<bias1_grad> Y_grad: bias1_grad}}}"];
-
-                relu1_grad [label="{type: relu_grad |  {input |<after_act1_grad> Out_grad: after_act1_grad} | {ouput | {<before_act1_grad>X_grad: before_act1_grad }}}"];
-
-		linear1_grad -> elementwise_add1_grad ->relu1_grad [dir=back];
-                label = "backward tape";
-	}
-
-        relu1_grad:after_act1_grad->after_act1_grad
-        relu1_grad:before_act1_grad->before_act1_grad
-
-        elementwise_add1_grad:before_act1_grad->before_act1_grad
-        elementwise_add1_grad:before_bias1_grad->before_bias1_grad
-        elementwise_add1_grad:bias1_grad->bias1_grad
-
-        linear1_grad:before_mul1->before_mul1
-        linear1_grad:weight1->weight1
-        linear1_grad:before_bias1_grad->before_bias1_grad
-        linear1_grad:before_mul1_grad->before_mul1_grad
-        linear1_grad:weight1_grad->weight1_grad
-
-
-	subgraph cluster_2 {
-                node [shape=record];
-                label = "Linear1";
-                weight1
-                bias1
-	}
-
-        weight1 -> weight1_grad [ label="Grad()", style="dashed" ];
-        bias1 -> bias1_grad [ label="Grad()", style="dashed"];
-
-	
-
-}
-</details>
-
-![Image](https://github.com/tonyyang-svail/Paddle/blob/cpp_tap/paddle/contrib/tape/computation_graph.png)
-
-## Code Reuse
-
-We want to stay close to Paddle Fluid as much as possible.
-
-### Reuse All Operators
-
-As all Ops are registered at `OpInfoMap`, the effort of adding a new `Function`
-is about 10 lines of code, similar to expose an operator to Python.
-
-### Reuse Compile Time InferShape and InferVarType
-
-Note that all the symbolic information is stored at `tape::Varaible::desc_`, instead
-of `ProgramDesc.block.vars`, we create a temporary `BlockDesc` to do `InferShape` and
-`InferVarType` every time we `AddOp` to the tape.
-
-### Reuse Operator::Run
-
-We use smart pointer, instead of `Scope`, to manage memory. So we create a temporary
-`Scope` for every `Operator::Run()`.
-
-## Possible Feature
-
-### Release Memory on Backward
-
-We can release memory aggressively. During backward, we can delete the OpHandle once
-we have finished its backward. Since all the variable is managed by smart pointer, the
-memory is automatically released when its `ref_count` goes to 0.
-
-### Kernel Fusion
-
-As a symbolic representation of the Tape is constructed first before the actual
-execution, it would be possible to perform graph optimization. One use case is kernel
-fusion.
--- a/paddle/contrib/tape/computation_graph.png
+++ b/paddle/contrib/tape/computation_graph.png
--- a/paddle/contrib/tape/function.h
+++ b/paddle/contrib/tape/function.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-
-#include "paddle/contrib/tape/tape.h"
-#include "paddle/contrib/tape/variable.h"
-#include "paddle/fluid/framework/type_defs.h"
-
-namespace paddle {
-namespace tape {
-
-class Function {};
-
-class Fill {
- public:
-  Fill(const std::string &initializer, const framework::AttributeMap &attrs)
-      : initializer_(initializer), attrs_(attrs) {}
-
-  void operator()(VariableHandle var) {
-    get_global_tape().AddOp(initializer_, {}, {{"Out", {var}}}, attrs_);
-  }
-
- private:
-  const std::string initializer_;
-  const framework::AttributeMap attrs_;
-};
-
-class Mean {
- public:
-  VariableHandle operator()(VariableHandle var) {
-    VariableHandle out(new Variable("mean"));
-    get_global_tape().AddOp("mean", {{"X", {var}}}, {{"Out", {out}}}, {});
-    return out;
-  }
-};
-
-class Linear {
- public:
-  Linear(int in_dim, int out_dim, const std::string &act)
-      : w_(new Variable("LinearWeight")),
-        b_(new Variable("LinearBias")),
-        act_(act) {
-    Tape init_tape;
-
-    std::string initializer = "fill_constant";
-    framework::AttributeMap attrs;
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{in_dim, out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {w_}}}, attrs);
-
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{out_dim};
-    attrs["value"] = 1.0f;
-    init_tape.AddOp(initializer, {}, {{"Out", {b_}}}, attrs);
-
-    init_tape.Forward();
-  }
-
-  VariableHandle operator()(VariableHandle input) {
-    VariableHandle pre_bias(new Variable("linear"));
-    get_global_tape().AddOp("mul",
-                            {{"X", {input}}, {"Y", {w_}}},
-                            {{"Out", {pre_bias}}},
-                            {{"x_num_col_dims", 1}, {"y_num_col_dims", 1}});
-    VariableHandle pre_act(new Variable("linear"));
-    get_global_tape().AddOp("elementwise_add",
-                            {{"X", {pre_bias}}, {"Y", {b_}}},
-                            {{"Out", {pre_act}}},
-                            {{"axis", 1}});
-    VariableHandle post_act(new Variable("linear"));
-    get_global_tape().AddOp(
-        act_, {{"X", {pre_act}}}, {{"Out", {post_act}}}, {});
-    return post_act;
-  }
-
-  std::vector<VariableHandle> Params() { return {w_, b_}; }
-
- private:
-  VariableHandle w_;
-  VariableHandle b_;
-  std::string act_;
-};
-
-class SGD {
- public:
-  SGD(float learning_rate) : learning_rate_(new Variable("sgd")) {
-    Tape init_tape;
-
-    std::string initializer = "fill_constant";
-    framework::AttributeMap attrs;
-    attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-    attrs["shape"] = std::vector<int>{1};
-    attrs["value"] = learning_rate;
-    init_tape.AddOp(initializer, {}, {{"Out", {learning_rate_}}}, attrs);
-
-    init_tape.Forward();
-  }
-
-  void operator()(VariableHandle input) {
-    PADDLE_ENFORCE(get_global_tape().HasBeenBackwarded(),
-                   "optimization must happen after the backward");
-    Tape temp_tape;
-    temp_tape.AddOp("sgd",
-                    {{"Param", {input}},
-                     {"LearningRate", {learning_rate_}},
-                     {"Grad", {input->Grad()}}},
-                    {{"ParamOut", {input}}},
-                    {});
-    temp_tape.Forward();
-  }
-
- private:
-  VariableHandle learning_rate_;
-};
-}
-}
--- a/paddle/contrib/tape/tape.cc
+++ b/paddle/contrib/tape/tape.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/contrib/tape/tape.h"
-
-#include <list>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/dim.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/pybind/pybind.h"
-
-namespace paddle {
-namespace tape {
-
-// borrowed from
-// https://stackoverflow.com/questions/874134/find-if-string-ends-with-another-string-in-c
-inline bool ends_with(std::string const &value, std::string const &ending) {
-  if (ending.size() > value.size()) return false;
-  return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
-}
-
-std::ostream &operator<<(std::ostream &os, const framework::VarDesc &var_desc) {
-  os << var_desc.Name();
-  os << "[" << var_desc.GetType() << "]";
-  os << "[" << var_desc.GetDataType() << "]";
-  os << "{";
-  for (auto &i : var_desc.GetShape()) {
-    os << i << ",";
-  }
-  os << "}";
-  return os;
-}
-
-std::string to_string(const std::string &type,
-                      const VariableHandleMap &in_vars,
-                      const VariableHandleMap &out_vars,
-                      const framework::AttributeMap &attrs) {
-  std::stringstream ss;
-  ss << type << " ";
-  for (auto &param_name : in_vars) {
-    for (auto &var : param_name.second) {
-      ss << param_name.first << ":(" << var->Desc() << ") ";
-    }
-  }
-  for (auto &param_name : out_vars) {
-    for (auto &var : param_name.second) {
-      ss << param_name.first << ":(" << var->Desc() << ") ";
-    }
-  }
-  return ss.str();
-}
-
-framework::OpDesc CreateOpDesc(const std::string &type,
-                               const VariableHandleMap &in_vars,
-                               const VariableHandleMap &out_vars,
-                               const framework::AttributeMap &attrs) {
-  framework::VariableNameMap inputs;
-  for (auto &param_name : in_vars) {
-    for (auto &var : param_name.second) {
-      inputs[param_name.first].emplace_back(var->Name());
-    }
-  }
-  framework::VariableNameMap outputs;
-  for (auto &param_name : out_vars) {
-    for (auto &var : param_name.second) {
-      outputs[param_name.first].emplace_back(var->Name());
-    }
-  }
-  return framework::OpDesc(type, inputs, outputs, attrs);
-}
-
-void InferShapeAndVarType(const std::string &type,
-                          const VariableHandleMap &in_vars,
-                          VariableHandleMap *out_vars,
-                          const framework::AttributeMap &attrs) {
-  framework::OpDesc op_desc = CreateOpDesc(type, in_vars, *out_vars, attrs);
-
-  // Create a temporary block for compile-time
-  framework::ProgramDesc program_desc;
-  framework::BlockDesc *block_desc = program_desc.MutableBlock(0);
-  PADDLE_ENFORCE(block_desc);
-
-  for (auto &param_name : in_vars) {
-    for (auto &var : param_name.second) {
-      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
-    }
-  }
-  for (auto &param_name : *out_vars) {
-    for (auto &var : param_name.second) {
-      *block_desc->Var(var->Name())->Proto() = *var->MutableDesc()->Proto();
-    }
-  }
-
-  LOG(INFO) << "- " << to_string(type, in_vars, *out_vars, attrs);
-  op_desc.InferShape(*block_desc);
-  op_desc.InferVarType(block_desc);
-  for (auto &param_name : *out_vars) {
-    for (auto &var : param_name.second) {
-      *var->MutableDesc()->Proto() = *block_desc->Var(var->Name())->Proto();
-    }
-  }
-  LOG(INFO) << "+ " << to_string(type, in_vars, *out_vars, attrs);
-}
-
-void Tape::AddOp(const std::string &type,
-                 const VariableHandleMap &in_vars,
-                 VariableHandleMap out_vars,
-                 const framework::AttributeMap &attrs) {
-  InferShapeAndVarType(type, in_vars, &out_vars, attrs);
-  tape_.emplace_back(type, in_vars, out_vars, attrs);
-}
-
-// Temporary Scope for Operator::Run()
-class ScopeWrapper : public framework::Scope {
- public:
-  ScopeWrapper(const VariableHandleMap &in_vars,
-               const VariableHandleMap &out_vars) {
-    for (auto &v : in_vars) {
-      for (auto &vv : v.second) {
-        if (!vars_.count(vv->Name())) {
-          vars_[vv->Name()].reset(vv->Var());
-        }
-      }
-    }
-    for (auto &v : out_vars) {
-      for (auto &vv : v.second) {
-        if (!vars_.count(vv->Name())) {
-          vars_[vv->Name()].reset(vv->Var());
-        }
-      }
-    }
-  }
-
-  ~ScopeWrapper() {
-    for (auto &pair : vars_) {
-      pair.second.release();
-    }
-  }
-};
-
-void Tape::Forward() {
-  LOG(INFO) << "Starting forward -------------------------";
-  PADDLE_ENFORCE(!has_been_backwarded_);
-  while (current_position_ < tape_.size()) {
-    OpHandle &op = tape_[current_position_];
-
-    // Create Output Tensor, this is only necessary for OpWithKernel
-    for (auto &param2var : op.outputs_) {
-      for (auto &var : param2var.second) {
-        var->InitializeVariable();
-      }
-    }
-
-    framework::OpDesc op_desc =
-        CreateOpDesc(op.type_, op.inputs_, op.outputs_, op.attrs_);
-    ScopeWrapper scope(op.inputs_, op.outputs_);
-    framework::OpRegistry::CreateOp(op_desc)->Run(scope, platform::CPUPlace());
-    current_position_++;
-  }
-
-  LOG(INFO) << "Finishing forward -------------------------";
-}
-
-void Tape::Backward(VariableHandle target) {
-  PADDLE_ENFORCE(!has_been_backwarded_);
-
-  Forward();
-
-  // TODO(tonyyang-svail): check output of last op is target
-  backward_tape_.reset(new Tape());
-
-  framework::AttributeMap attrs;
-
-  // FIXME(tonyyang-svail): Need to infer_data_type
-  attrs["dtype"] = framework::proto::VarType::Type::VarType_Type_FP32;
-  attrs["shape"] = std::vector<int>{1};
-  attrs["value"] = 1.0f;
-  backward_tape_->AddOp(
-      "fill_constant", {}, {{"Out", {target->Grad()}}}, attrs);
-
-  for (auto it = tape_.rbegin(); it != tape_.rend(); ++it) {
-    framework::OpDesc op_desc =
-        CreateOpDesc(it->type_, it->inputs_, it->outputs_, it->attrs_);
-    std::unordered_map<std::string, std::string> grad_to_var;
-    std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
-        framework::OpInfoMap::Instance()
-            .Get(op_desc.Type())
-            .GradOpMaker()(op_desc, {}, &grad_to_var, {});
-
-    for (auto &op_desc : grad_op_descs) {
-      std::unordered_map<std::string, VariableHandle> name2var;
-      for (auto &param2vars : it->inputs_) {
-        for (auto &a : param2vars.second) {
-          name2var[a->Name()] = a;
-        }
-      }
-      for (auto &param2vars : it->outputs_) {
-        for (auto &a : param2vars.second) {
-          name2var[a->Name()] = a;
-        }
-      }
-
-      VariableHandleMap in_vars;
-      VariableHandleMap out_vars;
-      std::map<const framework::VariableNameMap *, VariableHandleMap *>
-          loop_over{{&op_desc->Inputs(), &in_vars},
-                    {&op_desc->Outputs(), &out_vars}};
-      for (auto &each : loop_over) {
-        auto &vmp = *each.first;
-        auto &vhm = *each.second;
-        for (auto &p2a : vmp) {
-          for (auto &argu : p2a.second) {
-            if (name2var.count(argu)) {
-              vhm[p2a.first].push_back(name2var[argu]);
-            } else {
-              PADDLE_ENFORCE(ends_with(argu, framework::kGradVarSuffix),
-                             argu.c_str());
-              std::string name = argu.substr(
-                  0, argu.size() - std::strlen(framework::kGradVarSuffix));
-              PADDLE_ENFORCE(name2var.count(name), name.c_str());
-              vhm[p2a.first].push_back(name2var[name]->Grad());
-            }
-          }
-        }
-      }
-
-      backward_tape_->AddOp(
-          op_desc->Type(), in_vars, out_vars, op_desc->GetAttrMap());
-    }
-
-    // TODO(tonyyang-svail): how to fill empty grad?
-    // TODO(tonyyang-svail): Sum var grad is necessary
-  }
-
-  backward_tape_->Forward();
-  has_been_backwarded_ = true;
-}
-
-Tape &get_global_tape() {
-  static Tape T;
-  return T;
-}
-
-void reset_global_tape() { get_global_tape() = Tape(); }
-}
-}
--- a/paddle/contrib/tape/variable.h
+++ b/paddle/contrib/tape/variable.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <memory>
-
-#include "paddle/fluid/framework/operator.h"  // framework::kGradVarSuffix
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace tape {
-
-class Variable;
-using VariableHandle = std::shared_ptr<Variable>;
-
-/*
- * Combination of
- *     framework::VarDesc desc_;
- *     framework::Variable var_;
- */
-class Variable {
- public:
-  Variable(const std::string pre_fix)
-      : desc_(pre_fix + std::to_string(count())) {}
-
-  Variable(const std::string pre_fix, bool is_grad)
-      : desc_(pre_fix + (is_grad ? framework::kGradVarSuffix
-                                 : std::to_string(count()))) {}
-
-  ~Variable() { LOG(INFO) << "Deleting " << Name(); }
-
-  // Instantiate LoDTensor/SelectedRow
-  void InitializeVariable();
-
-  VariableHandle Grad() {
-    if (grad_.expired()) {
-      VariableHandle new_grad(new Variable(desc_.Name(), true));
-      grad_ = new_grad;
-      return new_grad;
-    } else {
-      return VariableHandle(grad_);
-    }
-  }
-
-  // Stochastic Gradient Descent with Momentum
-  //  VariableHandle Momentum ();
-
-  //  void init(const std::string& initializer,
-  //            const framework::AttributeMap& attrs);
-
-  // void value() {};
-
-  const framework::VarDesc& Desc() const { return desc_; }
-  framework::VarDesc* MutableDesc() { return &desc_; }
-
-  // TODO(tonyyang-svail): No need to expose name
-  std::string Name() const { return desc_.Name(); }
-
-  framework::Variable* Var() { return &var_; }
-
- private:
-  int count() {
-    static int counter = 0;
-    return counter++;
-  }
-
-  framework::VarDesc desc_;
-  framework::Variable var_;
-
-  std::weak_ptr<Variable> grad_;
-};
-}
-}
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -295,13 +295,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,

 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
    const ProgramDesc& program, int block_id) {
-  auto* ctx = new ExecutorPrepareContext(program, block_id);
+  std::unique_ptr<ExecutorPrepareContext> ctx(
+      new ExecutorPrepareContext(program, block_id));
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto& block = program.Block(block_id);
  for (auto& op_desc : block.AllOps()) {
    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
  }
-  return std::unique_ptr<ExecutorPrepareContext>(ctx);
+  return ctx;
 }

 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
@@ -406,6 +407,9 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) {
      }
    }
  }
+#else
+  LOG(WARNING)
+      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
 #endif
 }


--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -18,6 +18,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
@@ -113,6 +114,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
  }
  places.emplace_back(platform::CPUPlace());
  platform::DeviceContextPool::Init(places);
+#ifndef PADDLE_WITH_MKLDNN
+  operators::math::SetNumThreads(1);
+#endif
 }

 void InitGLOG(const std::string &prog_name) {

--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -410,5 +410,38 @@ void LoDTensor::MergeLoDTensor(
  }
 }

+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    if (offset_lod[lvl].size() > 0) {
+      level.reserve(offset_lod[lvl].size() - 1);
+    }
+    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD offset_lod;
+  offset_lod.reserve(length_lod.size());
+  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    level.reserve(length_lod[lvl].size() + 1);
+    size_t tmp = 0;
+    level.push_back(tmp);
+    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
+      tmp += length_lod[lvl][idx];
+      level.push_back(tmp);
+    }
+    offset_lod.push_back(level);
+  }
+  return offset_lod;
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -226,5 +226,19 @@ extern void WriteToRecordIO(recordio::Writer* writer,
 extern std::vector<LoDTensor> ReadFromRecordIO(
    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);

+/*
+ * Convert between length-based LoD and offset-based LoD.
+ * The implementation of LoDTensor class use offset-based LoD.
+ * However, we want to expose the more user-friendly length-based
+ * LoD to the Python side instead.
+ *
+ * Example:
+ * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
+ * then length_lod = [[2, 1], [3, 2, 4]]
+ */
+LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
+
+LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -228,6 +228,38 @@ TEST(LoD, CheckAbsLoD) {
  ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }

+TEST(LoD, ConvertToLengthBasedLoD) {
+  LoD offset_lod;
+  offset_lod.push_back(std::vector<size_t>({0, 2}));
+  offset_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  offset_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  LoD length_lod = ConvertToLengthBasedLoD(offset_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({2}));
+  expected.push_back(std::vector<size_t>({1, 2}));
+  expected.push_back(std::vector<size_t>({2, 2, 1}));
+
+  EXPECT_EQ(length_lod, expected);
+}
+
+TEST(LoD, ConvertToOffsetBasedLoD) {
+  LoD length_lod;
+  length_lod.push_back(std::vector<size_t>({2}));
+  length_lod.push_back(std::vector<size_t>({1, 2}));
+  length_lod.push_back(std::vector<size_t>({2, 2, 1}));
+
+  LoD offset_lod = ConvertToOffsetBasedLoD(length_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 2}));
+  expected.push_back(std::vector<size_t>({0, 1, 3}));
+  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  EXPECT_EQ(offset_lod, expected);
+}
+
 template <typename T>
 static void TestRecordIO() {
  LoDTensor tensor;

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,48 +43,29 @@ Scope& Scope::NewScope() const {
 }

 Variable* Scope::Var(const std::string& name) {
-  // acquire the lock when new var under this scope
  std::unique_lock<std::mutex> lock(mutex_);
-  auto* v = FindVarLocally(name);
-  if (v != nullptr) return v;
-
-  v = new Variable();
-  vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
-  v->name_ = &(vars_.find(name)->first);
-  return v;
+  return VarInternal(name);
 }

 Variable* Scope::Var(std::string* name) {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
-    *name = var_name;
+    *name = new_name;
  }
-  return Var(var_name);
+  return VarInternal(new_name);
 }

 Variable* Scope::FindVar(const std::string& name) const {
-  // acquire the lock when find var
  std::unique_lock<std::mutex> lock(mutex_);
  return FindVarInternal(name);
 }

-Variable* Scope::FindVarInternal(const std::string& name) const {
-  auto var = FindVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVarInternal(name);
-}
-
 const Scope* Scope::FindScope(const Variable* var) const {
-  for (auto& kv : vars_) {
-    if (kv.second.get() == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindScopeInternal(var);
 }
+
 void Scope::DropKids() {
  std::unique_lock<std::mutex> lock(mutex_);
  for (Scope* s : kids_) delete s;
@@ -92,6 +73,7 @@ void Scope::DropKids() {
 }

 std::vector<std::string> Scope::LocalVarNames() const {
+  std::unique_lock<std::mutex> lock(mutex_);
  std::vector<std::string> known_vars;
  known_vars.reserve(this->vars_.size());
  for (auto& p : vars_) {
@@ -127,6 +109,39 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {

 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  RenameInternal(origin_name, new_name);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
+  RenameInternal(origin_name, new_name);
+  return new_name;
+}
+
+Variable* Scope::VarInternal(const std::string& name) {
+  auto* v = FindVarLocally(name);
+  if (v != nullptr) return v;
+
+  v = new Variable();
+  vars_[name].reset(v);
+  VLOG(3) << "Create variable " << name;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+const Scope* Scope::FindScopeInternal(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second.get() == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+
+void Scope::RenameInternal(const std::string& origin_name,
+                           const std::string& new_name) const {
  auto origin_it = vars_.find(origin_name);
  PADDLE_ENFORCE(origin_it != vars_.end(),
                 "Cannot find original variable with name %s", origin_name);
@@ -137,10 +152,12 @@ void Scope::Rename(const std::string& origin_name,
  vars_.erase(origin_it);
 }

-std::string Scope::Rename(const std::string& origin_name) const {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-  Rename(origin_name, var_name);
-  return var_name;
+Variable* Scope::FindVarInternal(const std::string& name) const {
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }

 Variable* Scope::FindVarLocally(const std::string& name) const {

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -88,12 +88,20 @@ class Scope {
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}

+  // Called by Var.
+  Variable* VarInternal(const std::string& name);
+
+  // Called by FindScope.
+  const Scope* FindScopeInternal(const Variable* var) const;
+
+  // Called by Rename.
+  void RenameInternal(const std::string& origin_name,
+                      const std::string& new_name) const;
+
  // Called by FindVar recursively.
-  // Caller doesn't own the returned Variable.
  Variable* FindVarInternal(const std::string& name) const;

  // Called by FindVarInternal and Var.
-  // Caller doesn't own the returned Variable.
  Variable* FindVarLocally(const std::string& name) const;

  // Scope in `kids_` are owned by this class.

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
 set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
-cc_library(analysis SRCS dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc fluid_to_data_flow_graph_pass.cc
-  DEPS paddle_fluid)
+cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+  fluid_to_data_flow_graph_pass.cc
+  data_flow_graph_to_fluid_pass.cc
+  tensorrt_subgraph_pass.cc
+  dfg_graphviz_draw_pass.cc
+  DEPS framework_proto)
 cc_test(test_node SRCS node_tester.cc DEPS analysis)
 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)

-cc_test(test_data_flow_graph SRCS data_flow_graph_tester.cc DEPS analysis ${FLUID_CORE_MODULES} paddle_fluid
-  ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-set_tests_properties(test_data_flow_graph PROPERTIES DEPENDS test_word2vec)
+function (inference_analysis_test TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS)
+    cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-cc_test(test_subgraph_splitter
-        SRCS subgraph_splitter_tester.cc
-        DEPS analysis paddle_fluid tensor
-        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-set_tests_properties(test_subgraph_splitter PROPERTIES DEPENDS test_word2vec)
+    cc_test(${TARGET}
+            SRCS "${analysis_test_SRCS}"
+            DEPS analysis
+            ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+    set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
+endfunction(inference_analysis_test)

-cc_test(test_dfg_graphviz_draw_pass
-        SRCS dfg_graphviz_draw_pass_tester.cc
-        DEPS analysis
-        ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model)
-set_tests_properties(test_dfg_graphviz_draw_pass PROPERTIES DEPENDS test_word2vec)
+inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
+inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
+inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
+#inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
+inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
--- a/paddle/contrib/tape/variable.cc
+++ b/paddle/contrib/tape/variable.cc
@@ -12,22 +12,4 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/contrib/tape/variable.h"
-
-namespace paddle {
-namespace tape {
-
-void Variable::InitializeVariable() {
-  LOG(INFO) << "Initialzing " << desc_.Name() << " as " << desc_.GetType();
-  framework::proto::VarType::Type var_type = desc_.GetType();
-  if (var_type == framework::proto::VarType::LOD_TENSOR) {
-    var_.GetMutable<framework::LoDTensor>();
-  } else if (var_type == framework::proto::VarType::SELECTED_ROWS) {
-    var_.GetMutable<framework::SelectedRows>();
-  } else {
-    PADDLE_THROW("Variable type %d is not in [LOD_TENSOR, SELECTED_ROWS]",
-                 var_type);
-  }
-}
-}
-}
+#include "paddle/fluid/inference/analysis/argument.h"
--- a/paddle/contrib/tape/tape.h
+++ b/paddle/contrib/tape/tape.h
@@ -11,54 +11,45 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/contrib/tape/variable.h"
-
-namespace paddle {
-namespace tape {
-
-using VariableHandleMap = std::map<std::string, std::vector<VariableHandle>>;

-struct OpHandle {
-  OpHandle(const std::string &type,
-           const VariableHandleMap &in_vars,
-           const VariableHandleMap &out_vars,
-           const framework::AttributeMap &attrs)
-      : type_(type), inputs_(in_vars), outputs_(out_vars), attrs_(attrs) {}
+/*
+ * This file defines the class Argument, which is the input and output of the
+ * analysis module. All the fields that needed either by Passes or PassManagers
+ * are contained in Argument.
+ *
+ * TODO(Superjomn) Find some way better to contain the fields when it grow too
+ * big.
+ */

-  std::string type_;
-  VariableHandleMap inputs_;
-  VariableHandleMap outputs_;
-  framework::AttributeMap attrs_;
-};
-
-class Tape {
- public:
-  void AddOp(const std::string &type,
-             const VariableHandleMap &in_vars,
-             VariableHandleMap out_vars,
-             const framework::AttributeMap &attrs);
-  void Forward();
-  void Backward(VariableHandle target);
-
-  bool HasBeenBackwarded() { return has_been_backwarded_; }
+#pragma once

- private:
-  bool has_been_backwarded_ = false;
-  size_t current_position_ = 0;
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"

-  std::vector<OpHandle> tape_;
-  std::shared_ptr<Tape> backward_tape_;
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * The argument definition of both Pass and PassManagers.
+ *
+ * All the fields should be registered here for clearness.
+ */
+struct Argument {
+  // The graph that process by the Passes or PassManagers.
+  std::unique_ptr<DataFlowGraph> main_dfg;
+
+  // The original program desc.
+  std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
 };

-Tape &get_global_tape();
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__)               \
+  if (UNLIKELY(!(field__))) {                                \
+    LOG(ERROR) << "field " << #field__ << " should be set."; \
+    return false;                                            \
+  }

-void reset_global_tape();
-}
-}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/inference/analysis/node.h"

 namespace paddle {
 namespace inference {
@@ -57,19 +58,7 @@ std::string DataFlowGraph::DotString() const {
  // Add nodes
  for (size_t i = 0; i < nodes.size(); i++) {
    const Node &node = nodes.Get(i);
-    switch (node.type()) {
-      case Node::Type::kValue:
-        dot.AddNode(node.repr(), node.dot_attrs());
-        break;
-      case Node::Type::kFunction:
-        dot.AddNode(node.repr(), node.dot_attrs());
-        break;
-      case Node::Type::kFunctionBlock:
-        dot.AddNode(node.repr(), node.dot_attrs());
-        break;
-      default:
-        PADDLE_THROW("unsupported Node type %d", static_cast<int>(node.type()));
-    }
+    dot.AddNode(node.repr(), node.dot_attrs());
  }

  // Add edges

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/framework/proto_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
+  desc_ = argument->origin_program_desc.get();
+  // Here some logic from program_desc.cc and will not add new interfaces into
+  // framework::ProgramDesc class, use some UT to assure the correctness.
+  auto* block = desc_->mutable_blocks()->Add();
+  block->set_idx(framework::kRootBlockIndex);
+  block->set_parent_idx(framework::kNoneBlockIndex);
+  return true;
+}
+
+bool DataFlowGraphToFluidPass::Finalize() { return true; }
+
+void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
+  auto traits = GraphTraits<DataFlowGraph>(graph);
+  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
+    if (it->deleted()) continue;
+    switch (it->type()) {
+      case Node::Type::kFunction:
+        LOG(INFO) << "add function " << it->name();
+        AddFluidOp(&(*it));
+        break;
+      case Node::Type::kFunctionBlock:
+        AddEngineOp(&(*it));
+        break;
+      default:
+        continue;
+    }
+  }
+}
+
+void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
+  LOG(INFO) << "processing func " << node->name();
+  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+  // currently only the main block is analyzed.
+  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto* op = main_block->add_ops();
+  LOG(INFO) << "to copy the op";
+  *op = *ori_op;  // copy the attributes, by default, these will not be changed
+                  // by analysis phrase.
+  // The inputs and outputs of the existing ops are not changed by tensorrt
+  // subgraph pass.
+  // NOTE It might be changed by other passes in the long run.
+}
+
+void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+  // auto* ori_op = static_cast<framework::proto::OpDesc*>(node->extra_info());
+  // auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  // auto* op = main_block->add_ops();
+  // TODO(Superjomn) Here need to expose some arguments for default setting.
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from fluid ProgramDesc to data flow
+ * graph.
+ */
+
+#pragma once
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
+ public:
+  DataFlowGraphToFluidPass() = default;
+
+  bool Initialize(Argument *argument) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  std::string repr() const override { return "DFG to fluid"; }
+  std::string description() const override {
+    return "Transform a DFG to a Fluid ProgramDesc";
+  }
+
+  Pass *CreatePrinterPass(std::ostream &os,
+                          const std::string &banner) const override {
+    return nullptr;
+  }
+
+ protected:
+  // Add a Fluid Op into the ProgramDesc.
+  void AddFluidOp(Node *node);
+  // Add a EngineOp into the ProgramDesc.
+  void AddEngineOp(Node *node);
+
+ private:
+  framework::proto::ProgramDesc *desc_;
+};
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -27,13 +27,12 @@ namespace inference {
 namespace analysis {

 TEST_F(DFG_Tester, Test) {
-  framework::proto::ProgramDesc new_desc;
  DataFlowGraph graph;

  FluidToDataFlowGraphPass pass0;
  DataFlowGraphToFluidPass pass1;
-  pass0.Initialize(desc);
-  pass1.Initialize(&new_desc);
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  ASSERT_TRUE(pass1.Initialize(&argument));

  pass0.Run(&graph);
  pass1.Run(&graph);

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
+  auto content = Draw(graph);
+  std::ofstream file(GenDotPath());
+  file.write(content.c_str(), content.size());
+  file.close();
+  LOG(INFO) << "draw dot to " << GenDotPath();
+}
+
+std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
+  Dot dot;
+  // Add nodes
+  for (size_t i = 0; i < graph->nodes.size(); i++) {
+    const Node &node = graph->nodes.Get(i);
+    if (config_.display_deleted_node || !node.deleted()) {
+      dot.AddNode(node.repr(), node.dot_attrs());
+    }
+  }
+  // Add edges
+  for (size_t i = 0; i < graph->nodes.size(); i++) {
+    const Node &node = graph->nodes.Get(i);
+    if (!config_.display_deleted_node && node.deleted()) continue;
+    for (auto &in : node.inlinks) {
+      if (!config_.display_deleted_node && in->deleted()) continue;
+      for (auto &in : node.inlinks) {
+        dot.AddEdge(in->repr(), node.repr(), {});
+      }
+    }
+  }
+  return dot.Build();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -21,6 +21,7 @@ limitations under the License. */

 #include <fstream>
 #include <string>
+#include "paddle/fluid/inference/analysis/dot.h"
 #include "paddle/fluid/inference/analysis/pass.h"

 namespace paddle {
@@ -32,35 +33,39 @@ namespace analysis {
 */
 class DFG_GraphvizDrawPass : public DataFlowGraphPass {
 public:
-  DFG_GraphvizDrawPass(const std::string& dir, const std::string& id)
-      : dir_(dir), id_(id) {}
-
-  bool Initialize() override { return Pass::Initialize(); }
-  void Run(DataFlowGraph* graph) override {
-    auto content = Draw(graph);
-    std::ofstream file(GenDotPath());
-    file.write(content.c_str(), content.size());
-    file.close();
-    LOG(INFO) << "draw dot to " << GenDotPath();
-  }
+  struct Config {
+    Config(const std::string &dir, const std::string &id,
+           bool display_deleted_node = false)
+        : dir(dir), id(id), display_deleted_node(display_deleted_node) {}
+
+    // The directory to store the .dot or .png files.
+    const std::string dir;
+    // The identifier for this dot file.
+    const std::string id;
+    // Whether to display deleted nodes, default false.
+    const bool display_deleted_node;
+  };
+
+  DFG_GraphvizDrawPass(const Config &config) : config_(config) {}

+  bool Initialize(Argument *argument) override { return true; }
+  void Run(DataFlowGraph *graph) override;
  bool Finalize() override { return Pass::Finalize(); }

-  Pass* CreatePrinterPass(std::ostream& os,
-                          const std::string& banner) const override {
-    return nullptr;
+  std::string repr() const override { return "DFG graphviz drawer"; }
+  std::string description() const override {
+    return "Debug a DFG by draw with graphviz";
  }

 private:
  // Path of the dot file to output.
  std::string GenDotPath() const {
-    return dir_ + "/" + "graph_" + id_ + ".dot";
+    return config_.dir + "/" + "graph_" + config_.id + ".dot";
  }

-  std::string Draw(DataFlowGraph* graph) { return graph->DotString(); }
+  std::string Draw(DataFlowGraph *graph);

-  std::string dir_;
-  std::string id_;
+  Config config_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -24,9 +24,10 @@ namespace inference {
 namespace analysis {

 TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
-  auto dfg = ProgramDescToDFG(desc);
-  DFG_GraphvizDrawPass pass("./", "test");
-  pass.Initialize();
+  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+  DFG_GraphvizDrawPass::Config config("./", "test");
+  DFG_GraphvizDrawPass pass(config);
+  pass.Initialize(&argument);
  pass.Run(&dfg);

  // test content
@@ -38,7 +39,8 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
  while (std::getline(file, line)) {
    no++;
  }
-  ASSERT_EQ(no, 82);
+  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
+  ASSERT_EQ(no, 112);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -21,19 +21,23 @@ namespace paddle {
 namespace inference {
 namespace analysis {

-FluidToDataFlowGraphPass::FluidToDataFlowGraphPass() {}
-
-bool FluidToDataFlowGraphPass::Initialize() { return Pass::Initialize(); }
-
-bool FluidToDataFlowGraphPass::Initialize(
-    const framework::proto::ProgramDesc &desc) {
-  desc_ = &desc;
+bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
+  PADDLE_ENFORCE(argument);
+  if (!argument->main_dfg) {
+    LOG(INFO) << "Init DFG";
+    argument->main_dfg.reset(new DataFlowGraph);
+  }
+  desc_ = argument->origin_program_desc.get();
  return true;
 }

 bool FluidToDataFlowGraphPass::Finalize() { return Pass::Finalize(); }

 void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
+  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE(desc_);
  // insert vars
  std::unordered_map<std::string, size_t> var2id;
  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
@@ -41,7 +45,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    const auto &var = main_block.vars(i);
    auto *v = graph->nodes.Create(Node::Type::kValue);
    v->SetName(var.name());
-    v->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&var)));
+    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
    var2id[var.name()] = v->id();
  }
  for (int i = 0; i < main_block.ops_size(); i++) {
@@ -51,7 +55,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    static_cast<Function *>(o)->SetFuncType(op.type());
    // Link to the original protobuf message's memory, make it easier to
    // generate from a data flow graph to fluid ProgramDesc.
-    o->SetExtraInfo(const_cast<void *>(static_cast<const void *>(&op)));
+    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
    // set inputs and outputs
    // TODO(Superjomn) make sure the InputNames is the real variable name.
    for (int j = 0; j < op.inputs_size(); j++) {

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -34,13 +34,18 @@ namespace analysis {
 */
 class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
 public:
-  FluidToDataFlowGraphPass();
-  bool Initialize() override;
-  bool Initialize(const framework::proto::ProgramDesc &desc) override;
+  FluidToDataFlowGraphPass() = default;
+
+  bool Initialize(Argument *argument) override;
  bool Finalize() override;

  void Run(DataFlowGraph *graph) override;

+  std::string repr() const override { return "fluid-to-data-flow-graph"; }
+  std::string description() const override {
+    return "transform a fluid ProgramDesc to a data flow graph.";
+  }
+
  Pass *CreatePrinterPass(std::ostream &os,
                          const std::string &banner) const override;


--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -23,11 +23,11 @@ namespace analysis {

 TEST_F(DFG_Tester, Init) {
  FluidToDataFlowGraphPass pass;
-  pass.Initialize();
-  pass.Initialize(desc);
+  pass.Initialize(&argument);
  DataFlowGraph graph;
  pass.Run(&graph);
-  ASSERT_GT(graph.nodes.size(), 0);
+  // Analysis is sensitive to ProgramDesc, careful to change the original model.
+  ASSERT_EQ(graph.nodes.size(), 37);
  pass.Finalize();
  LOG(INFO) << '\n' << graph.DotString();
 }

--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -62,6 +62,7 @@ struct DataTypeNamer {
    SET_TYPE(int);
    SET_TYPE(bool);
    SET_TYPE(float);
+    SET_TYPE(void *);
  }

  std::unordered_map<decltype(typeid(int).hash_code()),  // NOLINT

--- a/paddle/fluid/inference/analysis/node.cc
+++ b/paddle/fluid/inference/analysis/node.cc
@@ -40,6 +40,9 @@ Node *NodeMap::Create(Node::Type type) {
    case Node::Type::kValue:
      nodes_.emplace_back(new Value);
      break;
+    case Node::Type::kFunctionBlock:
+      nodes_.emplace_back(new FunctionBlock);
+      break;
    default:
      PADDLE_THROW("Not supported node type.");
  }

--- a/paddle/fluid/inference/analysis/node.h
+++ b/paddle/fluid/inference/analysis/node.h
@@ -71,12 +71,17 @@ class Node {

  // Get an additional attribute and convert it to T data type. NOTE this will
  // silently create a new attribute if not exists.
-  Attr &attr(const std::string &name) { return attrs_[name]; }
+  Attr &attr(const std::string &name) const { return attrs_[name]; }

  int id() const { return id_; }

-  bool deleted() const { return deleted_; }
+  // The Protobuf description is set/get with a void* to decouple Node interface
+  // from a specific kind of Protobuf message.
+  void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
+  void *pb_desc() const { return attr("pb_desc").Pointer(); }
+
  void SetDeleted() { deleted_ = true; }
+  bool deleted() const { return deleted_; }

  void SetName(const std::string &name) { name_ = name; }
  const std::string &name() const { return name_; }
@@ -84,29 +89,25 @@ class Node {
  void SetType(Type type) { type_ = type; }
  Type type() const { return type_; }

-  void *extra_info() const { return extra_info_; }
-  void SetExtraInfo(void *extra_info) { extra_info_ = extra_info; }
-
  // Input links.
  std::vector<Node *> inlinks;
  // Output links.
  std::vector<Node *> outlinks;

  // A helper class to maintain the status from Pass.
-  // TODO(superjomn) add a checker here to ensure the T is primary.
  struct Attr {
    // NOTE T should be a primary type or a struct combined by several primary
    // types.
    // NOTE the STL containers should not use here.
    // Some usages
-    // Attr attr;
-    // T data;
-    // attr.data.assign((char*)data, sizeof(data));
+    //   Attr attr;
+    //   attr.Bool() = true;

    bool &Bool() { return As<bool>(); }
    float &Float() { return As<float>(); }
    int32_t &Int32() { return As<int32_t>(); }
    int64_t &Int64() { return As<int64_t>(); }
+    void *&Pointer() { return As<void *>(); }

   private:
    template <typename T>
@@ -130,6 +131,7 @@ class Node {
    size_t type_hash_{std::numeric_limits<size_t>::max()};
  };

+  // Type checks.
  bool IsFunction() const { return type_ == Node::Type::kFunction; }
  bool IsValue() const { return type_ == Node::Type::kValue; }
  bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
@@ -148,9 +150,6 @@ class Node {
  Type type_{Type::kNone};
  // Mark this node is deleted by some pass.
  bool deleted_{false};
-
-  void *extra_info_;
-
  mutable std::unordered_map<std::string, Attr> attrs_;
 };


--- a/paddle/fluid/inference/analysis/pass.h
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <string>

 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/node.h"
@@ -30,19 +31,24 @@ namespace analysis {
 class Pass {
 public:
  Pass() = default;
-  virtual ~Pass() {}
+  virtual ~Pass() = default;
  // Virtual method overridden by subclasses to do only necessary initialization
  // before any pass is run.
-  virtual bool Initialize() { return false; }
+  // virtual bool Initialize() { return false; }
  // There is some passes such as FlowToDataFlowGraphPass that needs a
  // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
  // only couple with the proto file.
-  virtual bool Initialize(const framework::proto::ProgramDesc &desc) {
-    return false;
-  }
+  // virtual bool Initialize(const framework::proto::ProgramDesc &desc) { return
+  // false; }
  // There are some Passes such as DataFlowGraphToFluidPass that will output a
  // ProgramDesc.
-  virtual bool Initialize(framework::proto::ProgramDesc *desc) { return false; }
+  // virtual bool Initialize(framework::proto::ProgramDesc *desc) { return
+  // false; }
+
+  // Mutable Pass.
+  virtual bool Initialize(Argument *argument) { return false; }
+  // Readonly Pass.
+  virtual bool Initialize(const Argument &argument) { return false; }

  // Virtual method overriden by subclasses to do any necessary clean up after
  // all passes have run.
@@ -50,7 +56,9 @@ class Pass {

  // Get a Pass appropriate to print the Node this pass operates on.
  virtual Pass *CreatePrinterPass(std::ostream &os,
-                                  const std::string &banner) const = 0;
+                                  const std::string &banner) const {
+    return nullptr;
+  }

  // Run on a single Node.
  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
@@ -60,6 +68,11 @@ class Pass {
  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
  // Run on a single DataFlowGraph.
  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+
+  // Human-readable short representation.
+  virtual std::string repr() const = 0;
+  // Human-readable long description.
+  virtual std::string description() const = 0;
 };

 // NodePass process on any Node types.

--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void DfgPassManager::RunAll() {
+  PADDLE_ENFORCE(argument_);
+  for (auto& pass : data_) {
+    VLOG(4) << "Running pass [" << pass->repr() << "]";
+    pass->Run(argument_->main_dfg.get());
+  }
+}
+
+void NodePassManager::RunAll() {
+  PADDLE_ENFORCE(argument_);
+  PADDLE_ENFORCE(argument_->main_dfg.get());
+  auto trait =
+      GraphTraits<DataFlowGraph>(argument_->main_dfg.get()).nodes_in_DFS();
+  for (auto& node : trait) {
+    for (auto& pass : data_) {
+      pass->Run(&node);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/pass_manager.h
+++ b/paddle/fluid/inference/analysis/pass_manager.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the logic of pass management. The analysis for inference is
+ * a pipeline of Passes, a PassManager is a agency that helps to manage the
+ * executation of the Passes.
+ *
+ * There are two modes of Passes, the first one is called NodePass and takes
+ * an Node as input and output; the second one is called DFGPass and takes a
+ * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in
+ * the same pipeline, there are two kinds of PassManagers, both takes a DFG as
+ * input and output a DFG, but the Passes inside are different:
+ *
+ *   1. NodePassManager: the passes inside are all NodePasses, it can have
+ *      different graph trivial algorithm, for example, DFS_NodePassManager will
+ *      trigger the passes in depth first order;
+ *   2. DfgPassManager: the passes inside are all DfgPasses.
+ */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * PassManager is the base class for all pass managers, a pass manager has
+ * several Pass-es registered, and execute them in the linear order.
+ */
+class PassManager : public OrderedRegistry<Pass> {
+ public:
+  PassManager() = default;
+  // Call all the passes' Initialize methods. The desc and data_flow_graph are
+  // globally shared, so pass them as the arguemnts for all the pass managers.
+  virtual bool Initialize(const Argument& argument) { return false; }
+
+  virtual bool Initialize(Argument* argument) {
+    argument_ = argument;
+    for (auto& pass : data_) {
+      LOG(INFO) << "Initializing pass " << pass->repr();
+      if (!pass->Initialize(argument)) {
+        LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Call all the passes' Finalize methods.
+  virtual bool Finalize() {
+    for (auto& pass : data_) {
+      if (!pass->Finalize()) {
+        LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Run all the passes.
+  virtual void RunAll() = 0;
+
+  // Short identifier.
+  virtual std::string repr() const = 0;
+  // Long description.
+  virtual std::string description() const = 0;
+
+  virtual ~PassManager() = default;
+
+ protected:
+  Argument* argument_{nullptr};
+};
+
+/*
+ * A pass manager that process a DFG.
+ */
+class DfgPassManager : public PassManager {
+ public:
+  DfgPassManager() = default;
+
+  void RunAll() override;
+
+  virtual ~DfgPassManager() = default;
+};
+
+/*
+ * A pass manager that process a Node each time.
+ */
+class NodePassManager : public PassManager {
+ public:
+  NodePassManager() = default;
+
+  void RunAll() override;
+
+  virtual ~NodePassManager() = default;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class TestDfgPassManager final : public DfgPassManager {
+ public:
+  TestDfgPassManager() = default;
+  virtual ~TestDfgPassManager() = default;
+  // Short identifier.
+  std::string repr() const override { return "test-pass-manager"; }
+  // Long description.
+  std::string description() const override { return "test doc"; }
+};
+
+class TestNodePassManager final : public NodePassManager {
+ public:
+  virtual ~TestNodePassManager() = default;
+
+  std::string repr() const override { return "test-node-pass-manager"; }
+  std::string description() const override { return "test doc"; }
+};
+
+class TestNodePass final : public NodePass {
+ public:
+  virtual ~TestNodePass() = default;
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  void Run(Node* node) override {
+    LOG(INFO) << "- Processing node " << node->repr();
+  }
+
+  std::string repr() const override { return "test-node"; }
+  std::string description() const override { return "some doc"; }
+};
+
+TEST_F(DFG_Tester, DFG_pass_manager) {
+  TestDfgPassManager manager;
+  DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
+
+  manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass);
+  manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
+  manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
+
+  ASSERT_TRUE(manager.Initialize(&argument));
+  manager.RunAll();
+}
+
+TEST_F(DFG_Tester, Node_pass_manager) {
+  // Pre-process: initialize the DFG with the ProgramDesc first.
+  FluidToDataFlowGraphPass pass0;
+  pass0.Initialize(&argument);
+  pass0.Run(argument.main_dfg.get());
+
+  TestNodePassManager manager;
+  manager.Register("test-node-pass", new TestNodePass);
+  ASSERT_TRUE(manager.Initialize(&argument));
+  manager.RunAll();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -19,22 +19,23 @@ namespace paddle {
 namespace inference {
 namespace analysis {

+SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+  if (node->type() != Node::Type::kFunction) return false;
+  const auto* func = static_cast<const Function*>(node);
+  if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+      func->func_type() == "conv2d" || func->func_type() == "mul" ||
+      func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+    LOG(INFO) << "sub-graph marked " << node->repr();
+    return true;
+  }
+  return false;
+};
+
 TEST_F(DFG_Tester, Split) {
  auto desc = LoadProgramDesc();
  auto dfg = ProgramDescToDFG(desc);
  LOG(INFO) << "spliter\n" << dfg.DotString();

-  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
-    if (node->type() != Node::Type::kFunction) return false;
-    const auto* func = static_cast<const Function*>(node);
-    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
-        func->func_type() == "conv2d" || func->func_type() == "mul" ||
-        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
-      LOG(INFO) << "sub-graph marked " << node->repr();
-      return true;
-    }
-    return false;
-  };
  ASSERT_GT(dfg.nodes.size(), 5UL);

  auto subgraphs = SubGraphSplitter(&dfg, teller)();
@@ -62,6 +63,28 @@ TEST_F(DFG_Tester, Split) {
  ASSERT_EQ(subgraphs.back().size(), 6UL);
 }

+TEST_F(DFG_Tester, Fuse) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+
+  size_t count0 = dfg.nodes.size();
+
+  SubGraphFuse fuse(&dfg, teller);
+  fuse();
+
+  int count1 = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->deleted()) {
+      LOG(INFO) << "deleted " << node->repr();
+    }
+    count1 += node->deleted();
+  }
+
+  // At least one nodes should be deleted.
+  ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
+  ASSERT_EQ(6UL, count1);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/contrib/tape/test_tape.cc
+++ b/paddle/contrib/tape/test_tape.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,50 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "gtest/gtest.h"
-#include "paddle/contrib/tape/function.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"

-using namespace paddle::tape;
+namespace paddle {
+namespace inference {
+namespace analysis {

-TEST(Tape, TestMLP) {
-  LOG(INFO) << "TestMLP";
-  Linear linear1(3, 3, "relu");
-  Linear linear2(3, 3, "relu");
-  Mean mean;
+TensorRTSubGraphPass::TensorRTSubGraphPass(
+    const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller)
+    : node_inside_subgraph_teller_(teller) {}

-  SGD sgd(0.001);
-
-  std::string initializer = "fill_constant";
-  paddle::framework::AttributeMap attrs;
-  attrs["dtype"] = paddle::framework::proto::VarType::Type::VarType_Type_FP32;
-  attrs["shape"] = std::vector<int>{3, 3};
-  attrs["value"] = 1.0f;
-  Fill filler(initializer, attrs);
-
-  for (int i = 0; i < 2; ++i) {
-    reset_global_tape();
-
-    VariableHandle input(new Variable("input"));
-    filler(input);
-
-    auto loss = mean(linear2(linear1(input)));
-
-    get_global_tape().Backward(loss);
-
-    for (auto w : linear1.Params()) {
-      sgd(w);
-    }
-    for (auto w : linear2.Params()) {
-      sgd(w);
-    }
-  }
+void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
+  SubGraphFuse(graph, node_inside_subgraph_teller_);
 }

-int main(int argc, char** argv) {
-  std::vector<paddle::platform::Place> places;
-  places.emplace_back(paddle::platform::CPUPlace());
-  paddle::platform::DeviceContextPool::Init(places);
+}  // analysis
+}  // inference

-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
+}  // paddle
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Parse the graph and replace TensorRT supported nodes with SubGraphNode
+ */
+class TensorRTSubGraphPass : public DataFlowGraphPass {
+ public:
+  // Tell whether to transform a sub-graph into TensorRT.
+  using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
+
+  TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  // This class get a sub-graph as input and determine whether to transform this
+  // sub-graph into TensorRT.
+  void Run(DataFlowGraph* graph) override;
+
+ private:
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // paddle
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(model_dir, "", "inference test model dir");
+
+TEST(TensorRTSubGraph, single_pass) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+    if (node->type() != Node::Type::kFunction) return false;
+    const auto* func = static_cast<const Function*>(node);
+    if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+        func->func_type() == "conv2d" || func->func_type() == "mul" ||
+        func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+      LOG(INFO) << "sub-graph marked " << node->repr();
+      return true;
+    }
+    return false;
+  };
+
+  DFG_GraphvizDrawPass::Config config{"./", "test"};
+  DFG_GraphvizDrawPass dfg_pass(config);
+  dfg_pass.Initialize();
+
+  DFG_GraphvizDrawPass dfg_pass1(config);
+  dfg_pass1.Initialize();
+
+  dfg_pass.Run(&dfg);
+
+  TensorRTSubGraphPass trt_pass(std::move(teller));
+  trt_pass.Initialize();
+
+  trt_pass.Run(&dfg);
+
+  dfg_pass1.Run(&dfg);
+
+  // Check the TRT op's block desc
+  for (auto node : dfg.nodes.nodes()) {
+    if (node->IsFunctionBlock()) {
+    }
+  }
+}
+
+TEST(TensorRTSubGraph, pass_manager) {}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/ut_helper.h
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -15,33 +15,46 @@ limitations under the License. */
 #pragma once
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
+#include <fstream>
 #include <string>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/io.h"

 namespace paddle {
 namespace inference {
+
+// Read ProgramDesc from a __model__ file, defined in io.cc
+extern void ReadBinaryFile(const std::string& filename, std::string* contents);
+
 namespace analysis {

 DEFINE_string(inference_model_dir, "", "inference test model dir");

 static framework::proto::ProgramDesc LoadProgramDesc(
    const std::string& model_dir = FLAGS_inference_model_dir) {
-  paddle::platform::CPUPlace place;
-  paddle::framework::Executor executor(place);
-  paddle::framework::Scope scope;
-  auto program = Load(&executor, &scope, model_dir);
-  return *program->Proto();
+  std::string msg;
+  std::string net_file = FLAGS_inference_model_dir + "/__model__";
+  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
+  fin.seekg(0, std::ios::end);
+  msg.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(msg.at(0)), msg.size());
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  program_desc.ParseFromString(msg);
+  return program_desc;
 }

 static DataFlowGraph ProgramDescToDFG(
    const framework::proto::ProgramDesc& desc) {
  DataFlowGraph graph;
  FluidToDataFlowGraphPass pass;
-  pass.Initialize(desc);
+  Argument argument;
+  argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
+  pass.Initialize(&argument);
  pass.Run(&graph);
  pass.Finalize();
  return graph;
@@ -49,9 +62,12 @@ static DataFlowGraph ProgramDescToDFG(

 class DFG_Tester : public ::testing::Test {
 protected:
-  void SetUp() override { desc = LoadProgramDesc(FLAGS_inference_model_dir); }
+  void SetUp() override {
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
+  }

-  framework::proto::ProgramDesc desc;
+  Argument argument;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -64,7 +64,8 @@ class OpConverter {
    (*it)(op, scope, test_mode);
  }

-  // convert fluid block to tensorrt network
+  // Convert a fluid block to tensorrt network, NOTE it just convert operators,
+  // the INetwork's inputs and outputs should specified in some other modules.
  void ConvertBlock(const framework::proto::BlockDesc& block,
                    const std::unordered_set<std::string>& parameters,
                    const framework::Scope& scope, TensorRTEngine* engine) {

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -51,11 +51,12 @@ class TensorRTEngine : public EngineBase {
    nvinfer1::Weights w_;
  };

-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t* stream,
+  TensorRTEngine(int max_batch, int max_workspace,
+                 cudaStream_t* stream = nullptr,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
-        stream_(stream),
+        stream_(stream ? stream : &default_stream_),
        logger_(logger) {}

  virtual ~TensorRTEngine();
@@ -121,6 +122,8 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;
  cudaStream_t* stream_;
+  // If stream_ is not set from outside, hold its own stream.
+  cudaStream_t default_stream_;
  nvinfer1::ILogger& logger_;

  std::vector<Buffer> buffers_;
@@ -165,20 +168,31 @@ class TensorRTEngine : public EngineBase {
 */
 class TRT_EngineManager {
 public:
-  TensorRTEngine* Create(int max_batch, int max_workspace,
-                         cudaStream_t* stream) {
-    engines_.emplace_back(new TensorRTEngine(max_batch, max_workspace, stream));
-    return engines_.back().get();
+  bool HasEngine(const std::string& name) const {
+    return engines_.count(name) != 0;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create or get an engine called `name`
+  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
+                         const std::string& name) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+    engines_[name].reset(p);
+    return p;
  }

  void DeleteALl() {
-    for (auto& ptr : engines_) {
-      ptr.reset(nullptr);
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
    }
  }

 private:
-  std::vector<std::unique_ptr<TensorRTEngine>> engines_;
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
 };

 }  // namespace tensorrt

--- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -29,6 +29,7 @@ DEFINE_string(data_file, "", "File of input index data.");
 DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
+DECLARE_bool(use_mkldnn);

 inline double GetCurrentMs() {
  struct timeval time;
@@ -103,9 +104,9 @@ void ThreadRunInfer(
    const int tid, paddle::framework::Scope* scope,
    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
  // maybe framework:ProgramDesc is not thread-safe
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
  auto& sub_scope = scope->NewScope();
-  auto place = paddle::platform::CPUPlace();
-  auto executor = paddle::framework::Executor(place);
  auto inference_program =
      paddle::inference::Load(&executor, scope, FLAGS_model_path);

@@ -182,8 +183,8 @@ TEST(inference, nlp) {
    stop_ms = GetCurrentMs();
  } else {
    // 1. Define place, executor, scope
-    auto place = paddle::platform::CPUPlace();
-    auto executor = paddle::framework::Executor(place);
+    paddle::platform::CPUPlace place;
+    paddle::framework::Executor executor(place);

    // 2. Initialize the inference_program and load parameters
    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;

--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -43,14 +43,16 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {

  *index = 0;  // unlock memory

-  void* p;
+  void* p = nullptr;

 #ifdef PADDLE_WITH_MKLDNN
  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
  // memory alignment
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
+                    size);
 #else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
+                    size);
 #endif
  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);


--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -12,16 +12,20 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include "mkldnn.hpp"
 #include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"

 namespace paddle {
 namespace operators {

-using paddle::framework::Tensor;
-using paddle::platform::MKLDNNDeviceContext;
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::to_void_cast;

 namespace {
 std::string gethash(const mkldnn::memory::dims &operand_dims,
@@ -35,188 +39,260 @@ std::string gethash(const mkldnn::memory::dims &operand_dims,
  };
  return dim2str(operand_dims) + std::to_string(algorithm);
 }
+}  // namespace
+
+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};

-template <typename T, typename ExecContext>
-void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                     const T alpha = 0, const T beta = 0) {
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input OutGrad tensor");
+
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
+
+template <typename T>
+void eltwise_forward(const framework::ExecutionContext &ctx,
+                     mkldnn::algorithm algorithm, const T alpha = 0,
+                     const T beta = 0) {
  PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                 "It must use CPUPlace.");
-
  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
  const auto &mkldnn_engine = dev_ctx.GetEngine();

-  // get buffers
-  const auto *src = ctx.template Input<Tensor>("X");
-  const auto *src_data = src->template data<T>();
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Output<Tensor>("Out");

-  auto *dst = ctx.template Output<Tensor>("Out");
-  T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
+  const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());

-  // get memory dim
-  PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
+  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
                 "Input dim must be with 2 or 4");
-  std::vector<int> src_tz = framework::vectorize2int(src->dims());
+
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());
+
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();

  const std::string key = gethash(src_tz, algorithm);
  const std::string key_src_data =
      key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  const std::string key_dst_mem = key + "@eltwise_fwd_dst_mem";
-  const std::string key_fwd = key + "@eltwise_fwd";
+  const std::string key_src_layout =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
+  const std::string key_with_layout = key + std::to_string(src_format);
+  const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key_with_layout + "@eltwise_fwd";
+  const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
+
+  // save input data and layout to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(x_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+  auto p_src_layout = std::make_shared<memory::format>(src_format);
+  dev_ctx.SetBlob(key_src_layout, p_src_layout);

  auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
      dev_ctx.GetBlob(key_fwd));

-  // save input data to be referred in backward path
-  auto p_src_data = std::make_shared<const T *>(src_data);
-  dev_ctx.SetBlob(key_src_data, p_src_data);
+  std::shared_ptr<memory> dst_memory;

  if (p_fwd == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-
-    // create memory primitives
-    auto p_src_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(src_data)));
-    dev_ctx.SetBlob(key_src_mem, p_src_mem);
-
-    auto p_dst_mem = std::make_shared<mkldnn::memory>(mkldnn::memory(
-        {data_md, mkldnn_engine}, platform::to_void_cast(dst_data)));
-    dev_ctx.SetBlob(key_dst_mem, p_dst_mem);
-
-    auto fwd_desc = mkldnn::eltwise_forward::desc(
-        mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
-    auto p_fwd_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
-        fwd_desc, mkldnn_engine);
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    dev_ctx.SetBlob(key_fwd_pd, p_fwd_pd);
-    p_fwd = std::make_shared<mkldnn::eltwise_forward>(
-        *p_fwd_pd, *(p_src_mem.get()), *(p_dst_mem.get()));
+    // create mkldnn memory for input X
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
+    auto src_memory = std::shared_ptr<memory>(
+        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
+    // save src_memory to be referred in backward path
+    dev_ctx.SetBlob(key_src_mem, src_memory);
+
+    // create primitive descriptor for activation forward and save it
+    auto forward_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm,
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+        forward_desc, mkldnn_engine);
+
+    // save prim desc into global device context to be referred in backward path
+    dev_ctx.SetBlob(key_fwd_pd, forward_pd);
+
+    // create mkldnn memory for output y
+    dst_memory =
+        std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
+
+    dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    // create activation primitive
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
+                                                      *dst_memory);
    dev_ctx.SetBlob(key_fwd, p_fwd);
  } else {
    // primitives already exist
-    auto p_src_mem =
+    auto src_memory =
        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-    PADDLE_ENFORCE(p_src_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
-    auto p_dst_mem =
+    PADDLE_ENFORCE(src_memory != nullptr,
+                   "Fail to find eltwise src_memory in device context.");
+    dst_memory =
        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
-    PADDLE_ENFORCE(p_dst_mem != nullptr,
-                   "Fail to find eltwise p_src_mem in device context.");
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find eltwise dst_memory in device context.");

-    p_src_mem->set_data_handle(platform::to_void_reinterpret_cast(src_data));
-    p_dst_mem->set_data_handle(dst_data);
+    src_memory->set_data_handle(platform::to_void_cast(x_data));
+    dst_memory->set_data_handle(y_data);
  }

  // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_fwd.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_fwd);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  y->set_layout(DataLayout::kMKLDNN);
+  y->set_format(GetMKLDNNFormat(*dst_memory));
 }

-template <typename T, typename ExecContext>
-void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
-                  const T alpha = 0, const T beta = 0) {
+template <typename T>
+void eltwise_grad(const framework::ExecutionContext &ctx,
+                  mkldnn::algorithm algorithm, const T alpha = 0,
+                  const T beta = 0) {
  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
  const auto &mkldnn_engine = dev_ctx.GetEngine();

-  // get buffers
-  const auto *out = ctx.template Input<Tensor>("Out");
-
-  auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
-  const auto *diff_dst = dout->template data<T>();
+  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));

-  auto *dx =
-      ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
-  const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
+  const T *diff_y_data = diff_y->data<T>();
+  T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());

-  // get memory dim
-  std::vector<int> src_tz = framework::vectorize2int(out->dims());
+  std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());

-  const std::string key = gethash(src_tz, algorithm);
-  const std::string key_diff_src_mem = key + "@eltwise_diff_src_mem";
-  const std::string key_diff_dst_mem = key + "@eltwise_diff_dst_mem";
-  const std::string key_grad = key + "@eltwise_grad";
+  auto diff_y_format =
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();

+  const std::string key = gethash(diff_dst_tz, algorithm);
  const std::string key_src_data =
      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_layout =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
+  const auto p_src_layout =
+      std::static_pointer_cast<memory::format>(dev_ctx.GetBlob(key_src_layout));
+  const std::string key_src_mem =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
+  const std::string key_fwd_pd =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
+  const std::string key_with_layouts =
+      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
+  const std::string key_diff_src_mem =
+      key_with_layouts + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem =
+      key_with_layouts + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key_with_layouts + "@eltwise_grad";
+
  const auto p_src_data =
      std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));

-  const std::string key_src_mem = key + "@eltwise_fwd_src_mem";
-  auto p_src_mem =
+  auto src_memory =
      std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
-  p_src_mem->set_data_handle(*p_src_data.get());
+  PADDLE_ENFORCE(src_memory != nullptr,
+                 "Fail to find src_memory in device context");
+  src_memory->set_data_handle(*p_src_data.get());
+
+  std::shared_ptr<memory> diff_src_memory;

-  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_forward::primitive>(
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
      dev_ctx.GetBlob(key_grad));

  if (p_grad == nullptr) {
-    // create memory description
-    auto data_md = src_tz.size() == 2
-                       ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nc)
-                       : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                                 mkldnn::memory::format::nchw);
-
-    // create memory primitives
-    std::shared_ptr<void> p_diff_src_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_src)));
-    dev_ctx.SetBlob(key_diff_src_mem, p_diff_src_mem);
-    std::shared_ptr<void> p_diff_dst_mem =
-        std::make_shared<mkldnn::memory>(mkldnn::memory(
-            {data_md, mkldnn_engine}, platform::to_void_cast(diff_dst)));
-    dev_ctx.SetBlob(key_diff_dst_mem, p_diff_dst_mem);
-
-    auto bwd_desc = mkldnn::eltwise_backward::desc(algorithm, data_md, data_md,
-                                                   alpha, beta);
-
-    const std::string key_fwd_pd = key + "eltwise_fwd_pd";
-    auto *p_fwd_pd = static_cast<mkldnn::eltwise_forward::primitive_desc *>(
-        dev_ctx.GetBlob(key_fwd_pd).get());
-
-    auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
-        bwd_desc, mkldnn_engine, *p_fwd_pd);
-
+    // create mkldnn memory for input diff_y
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
+    auto diff_dst_memory = std::shared_ptr<memory>(
+        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
+    dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
+
+    // retrieve eltwise primitive desc from device context
+    auto forward_pd =
+        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_fwd_pd));
+    PADDLE_ENFORCE(forward_pd != nullptr,
+                   "Fail to find eltwise_fwd_pd in device context");
+
+    // ceate primitive descriptor for activation backward
+    auto backward_desc = mkldnn::eltwise_backward::desc(
+        algorithm, diff_dst_memory->get_primitive_desc().desc(),
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
+        backward_desc, mkldnn_engine, *forward_pd);
+
+    // create mkldnn memory for output diff_src
+    diff_src_memory = std::make_shared<memory>(
+        backward_pd.diff_src_primitive_desc(), diff_x_data);
+    dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);
+
+    // create activation backward primitive
    p_grad = std::make_shared<mkldnn::eltwise_backward>(
-        eltwise_bwd_prim_desc, *static_cast<mkldnn::memory *>(p_src_mem.get()),
-        *(static_cast<mkldnn::memory *>(p_diff_dst_mem.get())),
-        *(static_cast<mkldnn::memory *>(p_diff_src_mem.get())));
+        backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
+    dev_ctx.SetBlob(key_grad, p_grad);
  } else {
    // primitives already exist
-    auto p_diff_src_mem = std::static_pointer_cast<mkldnn::memory>(
+    diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
        dev_ctx.GetBlob(key_diff_src_mem));
-    auto p_diff_dst_mem = std::static_pointer_cast<mkldnn::memory>(
+    auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
        dev_ctx.GetBlob(key_diff_dst_mem));

-    p_diff_src_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_src));
-    p_diff_dst_mem->set_data_handle(
-        platform::to_void_reinterpret_cast(diff_dst));
+    diff_src_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_x_data));
+    diff_dst_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_y_data));
  }

  // push primitive to stream and wait until it's executed
-  std::vector<mkldnn::primitive> pipeline = {*(p_grad.get())};
-  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_grad);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  diff_x->set_layout(DataLayout::kMKLDNN);
+  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
 }
-}  // anonymous namespace

 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
    eltwise_forward<T>(ctx, algorithm);
  }
 };

 template <typename T, mkldnn::algorithm algorithm>
 struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
-  template <typename ExecContext>
-  void operator()(const ExecContext &ctx) const {
+  void operator()(const framework::ExecutionContext &ctx) const {
    eltwise_grad<T>(ctx, algorithm);
  }
 };

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+using paddle::framework::Tensor;
+
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
  class OP_NAME##OpMaker                                                \
      : public ::paddle::framework::OpProtoAndCheckerMaker {            \
@@ -29,7 +31,7 @@ namespace operators {
      AddAttr<bool>("use_mkldnn",                                       \
                    "(bool, default false) Only used in mkldnn kernel") \
          .SetDefault(false);                                           \
-      AddComment(OP_COMMENT);                                           \
+      AddComment(#OP_COMMENT);                                          \
    }                                                                   \
  }

@@ -58,7 +60,6 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                      const framework::OperatorWithKernel& oper,
                                      const std::string& name) {
  framework::LibraryType library{framework::LibraryType::kPlain};
-
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
 #ifdef PADDLE_WITH_MKLDNN
  auto it = oper.Attrs().find("use_mkldnn");
@@ -82,6 +83,7 @@ class ActivationOp : public framework::OperatorWithKernel {
    ctx->ShareLoD("X", /*->*/ "Out");
  }

+ protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return GetKernelType(ctx, *this, "X");
@@ -96,6 +98,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
  }

+ protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return GetKernelType(ctx, *this, "Out");
@@ -112,7 +115,7 @@ $$out = \frac{1}{1 + e^{-x}}$$
 __attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator

-$$out = \log \frac{1}{1 + e^{-x}}$$
+$$out = \\log \\frac{1}{1 + e^{-x}}$$

 )DOC";

@@ -133,7 +136,7 @@ $out = \max(x, 0)$
 __attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.

-$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 )DOC";

@@ -196,7 +199,7 @@ $out = [x]$
 __attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.

-$$out = \frac{1}{x}$$
+$$out = \\frac{1}{x}$$

 )DOC";

@@ -252,15 +255,14 @@ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "Output of Softshrink operator");
    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
    AddComment(R"DOC(
-Softshrink Activation Operator.
+:strong:`Softshrink Activation Operator`

-$$
-out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases} 
+         x - \lambda, \text{if } x > \lambda \\
+         x + \lambda, \text{if } x < -\lambda \\
+         0,  \text{otherwise}
+         \end{cases}

 )DOC");
  }
@@ -271,18 +273,18 @@ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input of HardShrink operator");
    AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+    AddAttr<float>("threshold",
+                   "The value of threshold for HardShrink. [default: 0.5]")
        .SetDefault(0.5f);
    AddComment(R"DOC(
-HardShrink Activation Operator.
+:strong:`HardShrink activation operator`

-$$
-out = \begin{cases} 
-    x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
+    out = \begin{cases}
+            x, \text{if } x > \lambda \\
+            x, \text{if } x < -\lambda \\
+            0,  \text{otherwise}
+          \end{cases}

 )DOC");
  }
@@ -394,18 +396,18 @@ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold", "The threshold location of activation")
+    AddAttr<float>("threshold",
+                   "The threshold location of activation. [default 1.0].")
        .SetDefault(1.0f);
    AddComment(R"DOC(
-ThresholdedRelu Activation Operator.
+:strong:`ThresholdedRelu activation operator`

-$$
-out = \begin{cases} 
-    x, \text{if } x > threshold \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::

+    out = \begin{cases}
+             x,  \text{if } x > threshold \\
+             0,  \text{otherwise}
+          \end{cases}
 )DOC");
  }
 };
@@ -444,7 +446,7 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Swish Activation Operator.

-$$out = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \\frac{x}{1 + e^{- \beta x}}$$

 )DOC");
  }

--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -91,32 +91,31 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
        "(int64_t). The number of chunks both in Inference and Label on the "
        "given mini-batch.");
    AddAttr<int>("num_chunk_types",
-                 "(int). The number of chunk type. See below for details.");
-    AddAttr<std::string>(
-        "chunk_scheme",
-        "(string, default IOB). The labeling scheme indicating "
-        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
-        "for details.")
+                 "The number of chunk type. See the description for details.");
+    AddAttr<std::string>("chunk_scheme",
+                         "The labeling scheme indicating "
+                         "how to encode the chunks. Must be IOB, IOE, IOBES or "
+                         "plain. See the description"
+                         "for details.")
        .SetDefault("IOB");
    AddAttr<std::vector<int>>("excluded_chunk_types",
-                              "(list<int>) A list including chunk type ids "
+                              "A list including chunk type ids "
                              "indicating chunk types that are not counted. "
-                              "See below for details.")
+                              "See the description for details.")
        .SetDefault(std::vector<int>{});
    AddComment(R"DOC(
 For some basics of chunking, please refer to
-‘Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>’.
+'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.

-
-CheckEvalOp computes the precision, recall, and F1-score of chunk detection,
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
 and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
 Here is a NER example of labeling for these tagging schemes:
-
- 	     Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
-  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
-  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
-  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
-  IOBES: B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+   
+          Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+   IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+   IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+   IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+   IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC

 There are three chunk types(named entity types) including PER(person), ORG(organization)
 and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
@@ -124,31 +123,31 @@ and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chun
 Since the calculations actually use label ids rather than labels, extra attention
 should be paid when mapping labels to ids to make CheckEvalOp work. The key point
 is that the listed equations are satisfied by ids.
-
-    tag_type = label % num_tag_type
-    chunk_type = label / num_tag_type
+   
+   tag_type = label % num_tag_type
+   chunk_type = label / num_tag_type

 where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
 is the num of chunk types, and `tag_type` get its value from the following table.
-
-    Scheme Begin Inside End   Single
-     plain   0     -      -     -
-     IOB     0     1      -     -
-     IOE     -     0      1     -
-     IOBES   0     1      2     3
+   
+   Scheme Begin Inside End   Single
+    plain   0     -      -     -
+    IOB     0     1      -     -
+    IOE     -     0      1     -
+    IOBES   0     1      2     3

 Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
 PER and LOC. To satisfy the above equations, the label map can be like this:

-    B-ORG  0
-    I-ORG  1
-    B-PER  2
-    I-PER  3
-    B-LOC  4
-    I-LOC  5
-    O      6
+   B-ORG  0
+   I-ORG  1
+   B-PER  2
+   I-PER  3
+   B-LOC  4
+   I-LOC  5
+   O      6

-It’s not hard to verify the equations noting that the num of chunk types
+It's not hard to verify the equations noting that the num of chunk types
 is 3 and the num of tag types in IOB scheme is 2. For example, the label
 id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
 I-LOC is 2, which consistent with the results from the equations.

--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -54,10 +54,19 @@ be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
 shown in the following formula:

 $$
-Out = \frac{max\_norm * X}{norm(X)},
+Out = \\frac{max\\_norm * X}{norm(X)},
 $$

 where $norm(X)$ represents the L2 norm of $X$.
+
+Examples:
+        .. code-block:: python
+
+            data = fluid.layer.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.clip_by_norm(
+                x=data, max_norm=0.5)
+
 )DOC");
  }
 };

--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -23,30 +23,26 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    OpComment comment;
-    AddInput("X",
-             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
-                             comment.type));
-    AddInput("Y", string::Sprintf(
-                      "(LoDTensor) the right hand operand of %s operator",
-                      comment.type));
+    AddInput("X", string::Sprintf("the left hand operand of %s operator",
+                                  comment.type));
+    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
+                                  comment.type));
    AddAttr<bool>("force_cpu",
-                  "(bool, default false) Force fill output variable to cpu "
+                  "Force fill output variable to cpu "
                  "memory. Otherwise, fill output variable to the running "
-                  "device")
-        .SetDefault(false);
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
+                  "device [default true].")
+        .SetDefault(true);
+    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
+                                     comment.equation));
+    AddComment(string::Sprintf(R"DOC(
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by %s
+calculated by $%s$
 )DOC",
-                               comment.type, comment.equation));
-    AddAttr<int>("axis",
-                 "(int, default -1). The start dimension index "
-                 "for broadcasting Y onto X.")
+                               comment.equation));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
  }

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -107,7 +107,13 @@ REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
                      false> /* set false to disable empty grad */);
 REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_CPU_KERNEL(
    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -15,7 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
 REGISTER_OP_CUDA_KERNEL(
    concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
 class ConcatGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

+    // get output tensor that the name is not kEmptyVarName
+    std::vector<framework::Tensor*> outputs;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs.push_back(outs[j]);
+      } else {
+        outputs.push_back(nullptr);
+      }
+    }
+
    // Sometimes direct copies will be faster, this maybe need deeply analysis.
    if (axis == 0 && outs.size() < 10) {
      size_t input_offset = 0;
-      auto in_stride = framework::stride_numel(in->dims());
+      const auto in_stride = framework::stride_numel(out_grad->dims());

-      for (auto& out : outs) {
-        out->mutable_data<T>(ctx.GetPlace());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                    out_stride, in->data<T>() + input_offset,
-                                    in_stride, out_stride[axis]);
+      for (size_t i = 0; i < outs.size(); ++i) {
+        auto out_stride = framework::stride_numel(ins[i]->dims());
+        auto* out = outputs[i];
+        if (out != nullptr) {
+          StridedNumelCopyWithAxis<T>(
+              ctx.device_context(), axis, out->data<T>(), out_stride,
+              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
+        }
        input_offset += out_stride[axis];
      }
    } else {
-      std::vector<framework::Tensor> outputs(outs.size());
-      for (size_t j = 0; j < outs.size(); ++j) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs[j] = *outs[j];
-      }
-
      auto& dev_ctx = ctx.template device_context<DeviceContext>();
      paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
          concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), &outputs);
+      concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
+                          &outputs);
    }
  }
 };

--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -156,7 +156,7 @@ Parameters(strides, paddings) are two elements. These two elements represent hei
 and width, respectively.
 The input(X) size and output(Out) size may be different.

-Example:
+For an example:
  Input:
       Input shape: $(N, C_{in}, H_{in}, W_{in})$
       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$

--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -76,9 +76,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
        .AsIntermediate();

    AddComment(R"DOC(
-Cosine Similarity Operator.
+**Cosine Similarity Operator**

-$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$
+$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$

 The input X and Y must have the same shape, except that the 1st dimension
 of input Y could be just 1 (different from input X), which will be

--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -53,21 +53,18 @@ sequence of observed tags.
 The output of this operator changes according to whether Input(Label) is given:

 1. Input(Label) is given:
-
-This happens in training. This operator is used to co-work with the chunk_eval
-operator.
-
-When Input(Label) is given, the crf_decoding operator returns a row vector
-with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an output is the
-input to chunk_eval operator.
+   This happens in training. This operator is used to co-work with the chunk_eval
+   operator.
+   When Input(Label) is given, the crf_decoding operator returns a row vector
+   with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
+   prediction, or 1 indicating a tag is correctly predicted. Such an output is the
+   input to chunk_eval operator.

 2. Input(Label) is not given:
-
-This is the standard decoding process.
+   This is the standard decoding process.

 The crf_decoding operator returns a row vector with shape [N x 1] whose values
-range from 0 to maximum tag number - 1. Each element indicates an index of a
+range from 0 to maximum tag number - 1, Each element indicates an index of a
 predicted tag.
 )DOC");
  }

--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -52,7 +52,7 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
  } else {
    res = ctx.Attr<std::vector<int>>("offsets");
    PADDLE_ENFORCE_EQ(
-        rank, res.size(),
+        rank, static_cast<int>(res.size()),
        "Offsets size should be equal to dimension size of input tensor.");
  }
  return res;

--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -30,19 +30,19 @@ class CumOp : public framework::OperatorWithKernel {
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "Input of Cumsum operator");
-    AddOutput("Out", "Output of Cumsum operator");
+    AddInput("X", "Input of cumsum operator");
+    AddOutput("Out", "Output of cumsum operator");
    AddAttr<int>("axis",
-                 "(int, default -1). The dimenstion to accumulate along. "
-                 "-1 means the last dimenstion")
+                 "The dimenstion to accumulate along. -1 means the last "
+                 "dimenstion [default -1].")
        .SetDefault(-1)
        .EqualGreaterThan(-1);
    AddAttr<bool>("exclusive",
-                  "bool, default false). Whether to perform exclusive cumsum")
+                  "Whether to perform exclusive cumsum. [default false].")
        .SetDefault(false);
    AddAttr<bool>("reverse",
-                  "bool, default false). If true, the cumsum is performed in "
-                  "the reversed direction")
+                  "If true, the cumsum is performed in the reversed direction. "
+                  "[default false].")
        .SetDefault(false);
    AddComment(R"DOC(
 The cumulative sum of the elements along a given axis.

--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -245,7 +245,7 @@ void GRPCClient::Proceed() {
    if (c->status_.ok()) {
      c->Process();
    } else {
-      LOG(ERROR) << "var: " << c->var_h_.String()
+      LOG(FATAL) << "var: " << c->var_h_.String()
                 << " grpc error:" << c->status_.error_message();
    }
    delete c;

--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -106,23 +106,36 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
              "and M represents the number of deocded boxes.");

    AddComment(R"DOC(
-Bounding Box Coder Operator.
+
+Bounding Box Coder.
+
 Encode/Decode the target bounding box with the priorbox information.
+
 The Encoding schema described below:
-ox = (tx - px) / pw / pxv
-oy = (ty - py) / ph / pyv
-ow = log(abs(tw / pw)) / pwv 
-oh = log(abs(th / ph)) / phv 
+
+    ox = (tx - px) / pw / pxv
+
+    oy = (ty - py) / ph / pyv
+
+    ow = log(abs(tw / pw)) / pwv 
+
+    oh = log(abs(th / ph)) / phv 
+
 The Decoding schema described below:
-ox = (pw * pxv * tx * + px) - tw / 2
-oy = (ph * pyv * ty * + py) - th / 2
-ow = exp(pwv * tw) * pw + tw / 2
-oh = exp(phv * th) * ph + th / 2
-where tx, ty, tw, th denote the target box's center coordinates, width and
-height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
-center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
-of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
-width and height.
+
+    ox = (pw * pxv * tx * + px) - tw / 2
+
+    oy = (ph * pyv * ty * + py) - th / 2
+
+    ow = exp(pwv * tw) * pw + tw / 2
+
+    oh = exp(phv * th) * ph + th / 2
+
+where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
+and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
+priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
+`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
+encoded/decoded coordinates, width and height.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/detection/iou_similarity_op.cc
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -68,15 +68,16 @@ class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
              "representing pairwise iou scores.");

    AddComment(R"DOC(
-IOU Similarity Operator.
+**IOU Similarity Operator**
+
 Computes intersection-over-union (IOU) between two box lists.
- Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
- boxes in 'Y' are shared by all instance of the batched inputs of X.
- Given two boxes A and B, the calculation of IOU is as follows:
+Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+boxes in 'Y' are shared by all instance of the batched inputs of X.
+Given two boxes A and B, the calculation of IOU is as follows:

 $$
 IOU(A, B) = 
-\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
+\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)}
 $$

 )DOC");

--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -83,11 +83,13 @@ class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {

    AddComment(R"DOC(
 PolygonBoxTransform Operator.
+
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+
 The input is the final geometry output in detection network.
 We use 2*n numbers to denote the coordinate shift from n corner vertices of
 the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
 the geometry output contains 2*n channels.
-PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
 namespace ops = paddle::operators;
-REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\odot\\ Y");
+REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul,
    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,

--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -36,11 +36,12 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
  void Apply() override {
    AddAttr<float>("mean",
                   "(float, default 0.0) "
-                   "mean of random tensor.")
+                   "The mean (or center) of the gaussian distribution.")
        .SetDefault(.0f);
    AddAttr<float>("std",
                   "(float, default 1.0) "
-                   "std of random tensor.")
+                   "The standard deviation (std, or spread) of the "
+                   "gaussian distribution.")
        .SetDefault(1.0f);
    AddAttr<int>("seed",
                 "(int, default 0) "
@@ -55,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
        .SetDefault(framework::proto::VarType::FP32);

    AddComment(R"DOC(
-GaussianRandom Operator.

 Used to initialize tensors with gaussian random generator.
+The defalut mean of the distribution is 0. and defalut standard
+deviation (std) of the distribution is 1.. Uers can set mean and std
+by input arguments.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -85,7 +85,7 @@ class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
        .InEnum({"CUDA", "CPU", "AUTO"})
        .SetDefault("AUTO");
    AddComment(R"DOC(
-Returns a list of places based on flags. The list will be used for parallel
+Returns a list of places based on arguments. The list will be used for parallel
 execution.
 )DOC");
  }

--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -62,36 +62,33 @@ class LayerNormOp : public framework::OperatorWithKernel {
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("X", "The input tensor.");
    AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "(optional) Scale is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
             "It is applied to the output.")
        .AsDispensable();
    AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "(optional) Bias is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
             "It is applied to the output.")
        .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
-        .AsIntermediate();
-    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
+    AddOutput("Variance", "Variance of the current mini batch.")
        .AsIntermediate();

    AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
-                   "numerical stability")
+                   "Constant for numerical stability [default 1e-5].")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                         "'epsilon' should be between 0.0 and 0.001.");
        });
    AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
-                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
+                 "matrix [N,H]. [default 1].")
        .SetDefault(1)
        .AddCustomChecker([](const int &begin_norm_axis) {
          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
@@ -99,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
        });

    AddComment(R"DOC(
-Layer Normalization.
-Layer Norm has been implemented as discussed in the paper:
-https://arxiv.org/abs/1607.06450
-...
+Assume feature vectors exist on dimensions
+:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+along these dimensions for each feature vector :math:`a` with size
+:math:`H`, then normalize each feature vector using the corresponding
+statistics. After that, apply learnable gain and bias on the normalized
+tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
 )DOC");
  }
 };

--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -84,6 +84,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
 http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.

 Equation:
+
 1. Denote Input(Emission) to this operator as $x$ here.
 2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as $a$ here.
@@ -106,6 +107,7 @@ Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.

 NOTE:
+
 1. The feature function for a CRF is made up of the emission features and the
 transition features. The emission feature weights are NOT computed in
 this operator. They MUST be computed first before this operator is called.

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -348,7 +348,8 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 };

 void SignalHandler::StopAndExit(int signal_num) {
-  VLOG(3) << "Catch interrupt signal: " << signal_num << ", program will exit";
+  // Do not use VLOG here for the device for printing maybe already released.
+  // exit will release interal allocated resoureces.
  exit(0);
 }


--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -184,34 +184,32 @@ Long-Short Term Memory (LSTM) Operator.
 The defalut implementation is diagonal/peephole connection
 (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:

-$$
-i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\
+$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$

-f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\
+$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$

-\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\
+$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$

-o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\
+$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$

-c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$

-h_t = o_t \odot act_h(c_t)
-$$
+$$ h_t = o_t \\odot act_h(c_t) $$

-where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-are diagonal weight matrices for peephole connections. In our implementation,
-we use vectors to reprenset these diagonal weight matrices. The b terms
-denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
-is the non-line activations, such as logistic sigmoid function, and
-$i, f, o$ and $c$ are the input gate, forget gate, output gate,
-and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector $h$.
-
-The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
-are the cell input and cell output activation functions and `tanh` is usually
-used for them. $\tilde{c_t}$ is also called candidate hidden state,
-which is computed based on the current input and the previous hidden state.
+- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+  of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+  are diagonal weight matrices for peephole connections. In our implementation,
+  we use vectors to reprenset these diagonal weight matrices.
+- The b terms denote bias vectors ($b_i$ is the input gate bias vector).
+- $\sigma$ is the non-line activations, such as logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- The $\odot$ is the element-wise product of the vectors.
+- $act_g$ and $act_h$ are the cell input and cell output activation functions
+  and `tanh` is usually used for them.
+- $\tilde{c_t}$ is also called candidate hidden state,
+  which is computed based on the current input and the previous hidden state.

 Set `use_peepholes` False to disable peephole connection. The formula
 is omitted here, please refer to the paper

--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -70,35 +70,40 @@ template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
    // TODO(zcd): Add input data validity checking
-    int num = outputs->size();
+    size_t num = outputs->size();

    int input_rows = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
    for (int i = 0; i < axis; ++i) {
      input_rows *= dim_0[i];
    }
+
    int input_cols = 0;

    std::vector<int64_t> output_cols(outputs->size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = outputs->at(i).numel() / input_rows;
+    for (size_t i = 0; i < num; ++i) {
+      int t_cols = ref_inputs[i]->numel() / input_rows;
      input_cols += t_cols;
      output_cols[i] = t_cols;
    }
    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());

    // computation
-    for (int k = 0; k < input_rows; ++k) {
+    for (size_t k = 0; k < input_rows; ++k) {
      const T* src_ptr = input.data<T>() + k * input_cols;
      int col_idx = 0;
      for (int j = 0; j < num; ++j) {
        int col_len = output_cols[j];
-        T* dst_ptr = outputs->at(j).data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                     sizeof(T) * col_len);
+        auto* out_tensor = outputs->at(j);
+        if (out_tensor != nullptr) {
+          T* dst_ptr = out_tensor->data<T>() + k * col_len;
+          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
+                       sizeof(T) * col_len);
+        }
        col_idx += col_len;
      }
    }

--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
    T* output_ptr = outputs_data[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * segment_width + local_col] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
  }
 }

@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
    int split = tid_x / fixed_out_col;
    int in_offset = tid_x - split * fixed_out_col;
    T* output_ptr = outputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * fixed_out_col + in_offset] =
-          input_data[tid_y * in_col + tid_x];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
  }
 }

@@ -203,17 +207,18 @@ template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>* outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
    // TODO(zcd): Add input data validity checking
    int o_num = outputs->size();
    int out_row = 1;
-    auto dim_0 = outputs->at(0).dims();
+    auto dim_0 = ref_inputs[0]->dims();
    for (int i = 0; i < axis; ++i) {
      out_row *= dim_0[i];
    }

-    int out_col = outputs->at(0).numel() / out_row;
+    int out0_col = ref_inputs[0]->numel() / out_row;
    int in_col = 0, in_row = out_row;
    bool sameShape = true;

@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {

    outputs_cols[0] = 0;
    for (int i = 0; i < o_num; ++i) {
-      int t_col = outputs->at(i).numel() / out_row;
+      int t_col = outputs->at(i)->numel() / out_row;
      if (sameShape) {
-        if (t_col != out_col) sameShape = false;
+        if (t_col != out0_col) sameShape = false;
      }
      in_col += t_col;
      outputs_cols[i + 1] = in_col;
-      outputs_ptr[i] = outputs->at(i).data<T>();
+      if (outputs->at(i) != nullptr) {
+        outputs_ptr[i] = outputs->at(i)->data<T>();
+      } else {
+        outputs_ptr[i] = nullptr;
+      }
    }

    T** dev_out_gpu_data =
@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {

    if (sameShape) {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
    } else {
      const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(

--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -57,7 +57,8 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
 public:
  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>* outputs);
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs);
 };

 }  // namespace math

--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -33,12 +33,10 @@ class MeanOp : public framework::OperatorWithKernel {
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").Reuse("X");
+    AddInput("X", "(Tensor) The input of mean op");
+    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
    AddComment(R"DOC(
-Mean Operator.
-
-Out is a scalar which is the mean of all elements in X. 
+Mean Operator calculates the mean of all elements in X.

 )DOC");
  }

--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -62,26 +62,46 @@ class MultiplexOp : public framework::OperatorWithKernel {
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("Ids", "The index tensor of multiplex operator.");
-    AddInput("X", "The candidate tensors of multiplex operator.")
+    AddInput("Ids",
+             "Tensor<int32>, index variable which is a 2-D tensor with shape "
+             "[M, 1] where M is the batch size.");
+    AddInput("X",
+             "A list of variables to gather from. All variables have the same "
+             "shape and the rank is at least 2.")
        .AsDuplicable();
    AddOutput("Out", "The output tensor of multiplex operator.");
    AddComment(R"DOC(
-Multiplex Operator.
-
-Multiplex multiple tensors according to the index provided by the index tensor.
-
-Ids: the index tensor.
-X[0 : N - 1]: the candidate tensors for output (N >= 2).
-For each index i from 0 to batchSize - 1, the output is the i-th row of the
+Referring to the given index variable, this layer selects rows from the
+input variables to construct a multiplex variable. Assuming that there are
+:math:`m` input variables and :math:`I_i` represents the i-th input
+variable and :math:`i` is in [0, :math:`m`). All input variables are
+tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
+Please note that rank of the input tensor should be at least 2. Each input
+variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
+where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
+* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
+variable. The given index variable should be a 2-D tensor with shape
+[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
+Then the output variable will be a tensor with shape [:math:`d_0`,
+:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
+matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
+row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+
+* Ids: the index tensor.
+
+* X[0 : N - 1]: the candidate tensors for output (N >= 2).
+
+* For each index i from 0 to batchSize - 1, the output is the i-th row of the
 the (Ids[i])-th tensor.

 For i-th row of the output tensor:

-$$y[i] = x_{k}[i]$$
+$$
+y[i] = x_{k}[i]
+$$

-where `y` is the output tensor, `x_{k}` is the k-th input tensor,
-and `k = Ids[i]`.
+where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
+and $k = Ids[i]$.

 )DOC");
  }

--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -128,8 +128,10 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
                              "user should avoid setting this attribute.")
        .SetDefault({});
    AddComment(R"DOC(
-Compute and return the noise-contrastive estimation training loss.
-See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+Compute and return the noise-contrastive estimation training loss. See 
+`Noise-contrastive estimation: A new estimation principle for unnormalized 
+statistical models 
+ <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
 By default this operator uses a uniform distribution for sampling.
 )DOC");
  }

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -204,8 +204,6 @@ void Pool2dOpMaker::Make() {
  // TODO(dzhwinter): need to registered layout transform function

  AddComment(R"DOC(
-Pool2d Operator.
-
 The pooling2d operation calculates the output based on
 the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
@@ -215,19 +213,28 @@ These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.

 Example:
+
  Input:
+
       X shape: $(N, C, H_{in}, W_{in})$
+
  Output:
+
       Out shape: $(N, C, H_{out}, W_{out})$
+
  For ceil_mode = false:
       $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$
  For ceil_mode = true:
       $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
       $$

 )DOC");

--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -78,11 +78,15 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
 protected:
  void Apply() override {
-    AddAttr<std::string>("filename", "The filename of record io reader");
+    AddAttr<std::string>(
+        "filename",
+        "The filename of record file. This file will given to reader.");
    AddComment(R"DOC(
-      CreateRecordIOReader Operator
+Open a recordio file and return the reader object. The returned reader object
+is thread-safe.

-      Create a reader from a record io file
+NOTE: This is a very low-level API. It is used for debugging data file or
+training. Please use `open_files` instead of this API for production usage.
    )DOC");
  }
 };

--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -54,7 +54,7 @@ std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
 }

 void FileReaderMakerBase::Make() {
-  AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
+  AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
  AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
  AddAttr<std::vector<int>>(
      "ranks",

--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -139,7 +139,20 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                 "The pooled output width.")
        .SetDefault(1);
    AddComment(R"DOC(
-ROIPool operator
+**ROIPool Operator**
+
+Region of interest pooling (also known as RoI pooling) is to perform
+is to perform max pooling on inputs of nonuniform sizes to obtain
+fixed-size feature maps (e.g. 7*7).
+
+The operator has three steps:
+
+1. Dividing each region proposal into equal-sized sections with
+   the pooled_width and pooled_height
+
+2. Finding the largest value in each section
+
+3. Copying these max values to the output buffer

 ROI Pooling for Faster-RCNN. The link below is a further introduction: 
 https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn

--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -78,23 +78,23 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
-             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "the input(X) is a LodTensor, which supports "
             "variable time-length input sequences. The underlying tensor "
             "in this LoDTensor is a matrix with shape (T x N), where T "
             "is the total time steps in this mini-batch and N is the input "
             "data dimension.");
    AddInput("Filter",
-             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "the input(Filter) is a learnable parameter. It "
             "is a 2-D tensor with shape (future_context x N), where, "
             "future_context is the future context length and N is the data "
             "dimension.");
    AddOutput("Out",
-              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "the output(Out) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
              "in this LodTensor is a matrix with shape T x N, i.e., the "
              "same shape as X.");
    AddComment(R"DOC(
-Row-convolution Operator.
+:strong:`Row-convolution operator`

 The row convolution is called lookahead convolution.  This operator was 
 introduced in the following paper for DeepSpeech2:
@@ -114,9 +114,23 @@ and a filter ($W$) of size $context \times d$,
 the output sequence is convolved as:

 $$
-out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
+out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
 $$

+In the above equation:
+
+* $Out_{i}$: The i-th row of output variable with shape [1, D].
+
+* $\\tau$: Future context size.
+
+* $X_{j}$: The j-th row of input variable with shape [1, D].
+
+* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+
+More details about row_conv please refer to
+the design document
+https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
+
 )DOC");
  }
 };

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -41,13 +41,13 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("X", "(Tensor) Input tensor of scale operator.");
    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
    AddComment(R"DOC(
-Scale operator
+**Scale operator**
+
+Multiply the input tensor with a float scalar to scale the input tensor.

 $$Out = scale*X$$
 )DOC");
-    AddAttr<float>("scale",
-                   "(float, default 1.0)"
-                   "The scaling factor of the scale operator.")
+    AddAttr<float>("scale", "The scaling factor of the scale operator.")
        .SetDefault(1.0);
  }
 };

--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -36,10 +36,13 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Input", "(Tensor), The input tensor.");
-    AddOutput("Out", "(Tensor), The shape of input tensor.");
+    AddOutput("Out",
+              "(Tensor), The shape of input tensor, the data type of the shape"
+              " is int64_t, will be on the same device with the input Tensor.");
    AddComment(R"DOC(
-Shape Operator. 
-Get the shape of input tensor.
+Shape Operator
+
+Get the shape of input tensor. Only support CPU input Tensor now.
 )DOC");
  }
 };

--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -113,14 +113,14 @@ The logistic loss is given as follows:

       $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$

-We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:
+We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get:

       $$loss = X - X * Labels + \log(1 + \exp(-X))$$

 For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
 we reformulate the loss as follows:

-       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
+       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$

 Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
 However the output only shares the LoD with input `X`.

--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -95,23 +95,26 @@ of that dimension. If the value passed to start or end is larger than
 the n (the number of elements in this dimension), it represents n. 
 For slicing to the end of a dimension with unknown size, it is recommended 
 to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1].
-
-    Example 1:
-    Given:
-        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-        axes = [0, 1]
-        starts = [1, 0]
-        ends = [2, 3]
-    Then:
-        result = [ [5, 6, 7], ]
-
-    Example 2:
-    Given:
-        data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
-        starts = [0, 1]
-        ends = [-1, 1000]
-    Then:
-        result = [ [2, 3, 4], ]
+Following examples will explain how slice works:
+
+    .. code-block:: text
+
+        Cast1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Cast2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                starts = [0, 1]
+                ends = [-1, 1000]
+            Then:
+                result = [ [2, 3, 4], ]
 )DOC");
  }
 };

--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -115,4 +115,7 @@ USE_CPU_ONLY_OP(concat);

 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
 REGISTER_OP_CPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
@@ -15,4 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
+    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int>);
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -66,17 +66,25 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
 }  // namespace

 template <typename DeviceContext, typename T>
-void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
+void TensorRTEngineKernel<DeviceContext, T>::Prepare(
    const framework::ExecutionContext &context) const {
  VLOG(4) << "Prepare engine";
  // Get the ProgramDesc and pass to convert.
  framework::proto::BlockDesc block_desc;
  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-  max_batch_ = context.Attr<int>("max_batch");
+  int max_batch = context.Attr<int>("max_batch");
  auto max_workspace = context.Attr<int>("max_workspace");
-  engine_ = Singleton<TRT_EngineManager>::Global().Create(
-      max_batch_, max_workspace, &stream_);
-  engine_->InitNetwork();
+  auto params = context.Attr<std::vector<std::string>>("parameters");
+  std::unordered_set<std::string> parameters;
+  for (const auto &param : params) {
+    parameters.insert(param);
+  }
+
+  // TODO(Superjomn) replace this with a different stream
+  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+      context.Attr<std::string>("engine_uniq_key"));
+  engine->InitNetwork();

  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
  // Add inputs
@@ -87,24 +95,23 @@ void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                      "TensorRT engine only takes LoDTensor as input");
    auto shape = var->GetShape();
-    engine_->DeclareInput(
+    engine->DeclareInput(
        input, FluidDataType2TRT(
                   var->Proto()->type().lod_tensor().tensor().data_type()),
        Vec2TRT_Dims(var->GetShape()));
  }

-  // TODO(Superjomn) parameters should be passed after analysised from outside.
  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
-      block_desc, {}, context.scope(), engine_);
+      block_desc, parameters, context.scope(), engine);

  // Add outputs
  VLOG(4) << "declare outputs";
  for (auto &output : context.Outputs("Ys")) {
    VLOG(4) << "declare output " << output;
-    engine_->DeclareOutput(output);
+    engine->DeclareOutput(output);
  }

-  engine_->FreezeNetwork();
+  engine->FreezeNetwork();
 }

 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -113,6 +120,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Xs", "A list of inputs.").AsDuplicable();
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
    AddAttr<int>("max_batch", "the maximum batch size.");
    AddAttr<int>("max_workspace", "the maximum batch size.");
    AddComment("TensorRT engine operator.");

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -19,10 +19,14 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"

 namespace paddle {
 namespace operators {

+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
 class TensorRTEngineOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -47,16 +51,18 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    if (!engine_) {
+    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
    }
+    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
    // Try to determine a batch_size
    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
        context.scope(), input_names.front());
    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, max_batch_);
+    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));

    // Convert input tensor from fluid to engine.
    for (const auto& x : context.Inputs("Xs")) {
@@ -64,20 +70,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
          context.scope(), x);
      if (platform::is_cpu_place(t.place())) {
-        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
      } else {
-        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
-                                 t.memory_size());
+        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
      }
    }
    // Execute the engine.
    PADDLE_ENFORCE_GT(batch_size, 0);
-    engine_->Execute(batch_size);
+    engine->Execute(batch_size);
    // Convert output tensor from engine to fluid
    for (const auto& y : context.Outputs("Ys")) {
      // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
+      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
      auto dims = trt_t->getDimensions();
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
@@ -89,27 +95,22 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
      if (platform::is_cpu_place(fluid_t->place())) {
        // TODO(Superjomn) change this float to dtype size.
-        engine_->GetOutputInCPU(
+        engine->GetOutputInCPU(
            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
            size * sizeof(float));
      } else {
-        engine_->GetOutputInGPU(
+        engine->GetOutputInGPU(
            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
            size * sizeof(float));
      }
    }

-    cudaStreamSynchronize(stream_);
+    cudaStreamSynchronize(*engine->stream());
  }

 protected:
  // Build the engine.
  void Prepare(const framework::ExecutionContext& context) const;
-
- private:
-  mutable cudaStream_t stream_;
-  mutable inference::tensorrt::TensorRTEngine* engine_{nullptr};
-  mutable int max_batch_{0};
 };

 }  // namespace operators

--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -79,6 +79,17 @@ void SetAttr<int64_t>(framework::proto::OpDesc* op, const std::string& name,
  attr->set_type(paddle::framework::proto::AttrType::LONG);
  attr->set_l(data);
 }
+template <>
+void SetAttr<std::vector<std::string>>(framework::proto::OpDesc* op,
+                                       const std::string& name,
+                                       const std::vector<std::string>& data) {
+  auto* attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
+  for (const auto& s : data) {
+    attr->add_strings(s.c_str());
+  }
+}

 }  // namespace

@@ -123,11 +134,15 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 30);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
+                                    std::vector<std::string>({}));

  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+  LOG(INFO) << "engine_op " << engine_op.get();

  framework::Scope scope;
  platform::CPUPlace place;
@@ -145,6 +160,88 @@ TEST(TensorRTEngineOp, manual) {
  engine_op->Run(scope, place);
 }

+void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  using shape_t = std::vector<int64_t>;
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+
+  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
+                        const std::string& z_name, bool x_created,
+                        const shape_t& x_shape, const shape_t& y_shape,
+                        const shape_t& z_shape) {
+
+    LOG(INFO) << "create fc op";
+    auto* fc = block_desc.AppendOp();
+    fc->SetType("mul");
+    fc->SetInput("X", std::vector<std::string>({x_name}));
+    fc->SetInput("Y", std::vector<std::string>({y_name}));
+    fc->SetOutput("Out", std::vector<std::string>({z_name}));
+
+    // Set inputs' variable shape in BlockDesc
+    if (!x_created) {
+      AddTensorToBlockDesc(block_, x_name,
+                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
+    }
+    AddTensorToBlockDesc(block_, y_name,
+                         std::vector<int64_t>({input_dim, output_dim}));
+    AddTensorToBlockDesc(block_, z_name,
+                         std::vector<int64_t>({batch_size, output_dim}));
+
+    // Prepare variables.
+    if (!x_created) {
+      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
+    }
+    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
+    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
+
+    // It is wired, need to copy manually.
+    *block_->add_ops() = *fc->Proto();
+  };
+
+  // Test with 4 layer FC
+  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
+             {input_dim, output_dim}, {batch_size, output_dim});
+  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
+
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<std::vector<std::string>>(
+      engine_op_desc.Proto(), "parameters",
+      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
+
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  // Execute them.
+  engine_op->Run(scope, place);
+}
+
+// Test with a larger FC layer.
+TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
+
 }  // namespace operators
 }  // namespace paddle


--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -35,10 +35,10 @@ class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
 protected:
  void Apply() override {
    AddComment(R"DOC(
-Uniform random operator
+UniformRandomBatchSizeLike operator.

 This operator initializes a tensor with the same batch_size as the Input tensor
- with random values sampled from a uniform distribution.
+with random values sampled from a uniform distribution.

 )DOC");
    AddAttr<float>("min",

--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook