deal with conflict

f42ea489 · nhzlx · 940f5dbc · 1e417b93 · f42ea489 · f42ea489
194 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,12 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
 endif()

+if(WITH_MKL)
+  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
+  if (MKL_SPLIT_GEMM)
+    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
+  endif()
+endif()
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
    if (WITH_MKL AND AVX2_FOUND)

--- a/README.md
+++ b/README.md
@@ -18,7 +18,21 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.

-### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
+### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Install Latest Stable Release:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==0.14.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==0.14.0.post85
+
+# For installation on other platform, refer to http://paddlepaddle.org/
+```

 ## Features


--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
+#!/bin/bash
+
 set -e

 function clock_to_seconds() {

--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
+#!/bin/bash
+
 set -e

 function clock_to_seconds() {

--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -4,25 +4,42 @@ set(tmp_version "HEAD")
 set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
 set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
 while ("${PADDLE_VERSION}" STREQUAL "")
+  # Check current branch name
  execute_process(
-    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_TAG_NAME
-    RESULT_VARIABLE GIT_RESULT
+    OUTPUT_VARIABLE GIT_BRANCH_NAME
+    RESULT_VARIABLE GIT_BRANCH_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_RESULT})
-    # Check the tag is a correct version
-    if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
-      # if no tag was found, set PADDLE_VERSION to latest
-      set(PADDLE_VERSION "latest")
-    elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-      string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-    else()  # otherwise, get the previous git tag name.
-      set(tmp_version "${GIT_TAG_NAME}~1")
+  if (NOT ${GIT_BRANCH_RESULT})
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+      OUTPUT_VARIABLE GIT_TAG_NAME
+      RESULT_VARIABLE GIT_RESULT
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (NOT ${GIT_RESULT})
+      # Check if current branch is release branch
+      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
+        # Check the tag is a correct version
+        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+          # if no tag was found, set PADDLE_VERSION to latest
+          set(PADDLE_VERSION "latest")
+        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+          string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
+        else()  # otherwise, get the previous git tag name.
+          set(tmp_version "${GIT_TAG_NAME}~1")
+        endif()
+      else() # otherwise, we always set PADDLE_VERSION to latest
+        set(PADDLE_VERSION "latest")
+      endif()
+    else()
+      set(PADDLE_VERSION "0.0.0")
+      message(WARNING "Cannot add paddle version from git tag")
    endif()
  else()
    set(PADDLE_VERSION "0.0.0")
-    message(WARNING "Cannot add paddle version from git tag")
+    message(WARNING "Cannot add paddle version for wrong git branch result")
  endif()
 endwhile()


--- a/doc/fluid/design/ir/draft.md
+++ b/doc/fluid/design/ir/draft.md
+## Motivation
+
+There is a ```gap``` between the ```Program``` defined by
+user and the ```Executable``` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the ```gap``` is bridged by
+
+* A serious transformations with defined order.
+
+* These transformations usually involve
+```insert, delete, clustering, split, dependency analysis```.
+
+* Has a simple way to verify and debug each transformation.
+
+* Flexible to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware secenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need a intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit both training, inference
+and other offline serielized model transformations.
+Learned from LLVM and other deep learning framework, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+```Node``` represents an operation that performs some computation or
+a variable that is input or output of operation.
+
+```Node```s are connected to other ```Node```s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to ```Node``` in the future if it's a
+common requirement of many other ```Pass```es. Otherwise, it should live
+in a ```Node``` wrapper class that is private to some ```Pass``` or be
+a local member of a ```Pass```.
+
+#### Graph
+
+```Graph``` contains a list of ```Node```s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+```Graph``` can also contain ```Attribute```s. ```Attribute```s
+can be ``any`` thing. For example, it can be a list of "wraper"
+nodes. The ```wrapper``` nodes compose ```Node```s and provide
+helper method for execution or transformation. ```Attribute```
+can also contain other things that describe some properties of
+the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
+across ```Pass```. However, it should be used with care.
+
+#### Pass
+
+```Pass``` represents a transformation of ```Graph```. Its input
+is a ```Graph``` and its output is also a ```Graph```. For example,
+a ```Pass``` can simply print out the ```Graph```. A ```Pass```
+can also fuse some ```Graph```'s ```Node```s.
+
+#### Optimize
+
+```Optimize``` contains a series of ```Pass``` with defined order.
+```Optimize``` transforms a ```Graph``` that only contains raw
+modeling logic to a ```Graph``` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Pass
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -4,7 +4,6 @@ API
 ..  toctree::
    :maxdepth: 1

-    overview.rst
    model_configs.rst
    data.rst
    run_logic.rst
--- a/doc/v2/faq/parameter/index_en.rst
+++ b/doc/v2/faq/parameter/index_en.rst
-#################
-Parameter Setting
-#################
+##################
+Parameter Settings
+##################

-TBD
+.. contents::
+
+1. How to Choose the Learning Rate of SGD Algorithm
+--------------------------
+
+An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time.
+
+Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model.
+
+If the cost of the training process is significantly higher than the cost of the output, then we judge that the training does not converge. For example, if we have a three-class problem and use multi-class-cross-entropy as the cost, the ratio of 0, 1, and 2 in the data will be :code:`0.2, 0.5, 0.3`. The minimum cost thus will be :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is greater than this number after training a pass (or even before), then the training may not be converged and the learning rate should be reduced.
+
+2. How to Implement Learning Rate Annealing
+------------------------------------------------
+
+We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows:
+
+.. code-block:: python
+
+    Optimizer = paddle.optimizer.Adam(
+        Learning_rate=1e-3,
+        Learning_rate_decay_a=0.5,
+        Learning_rate_decay_b=0.75,
+        Learning_rate_schedule="poly",)
+
+PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
+
+* "constant"
+  
+  Lr = learning_rate
+
+* "poly"
+
+  Lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  Variable :code:`num_samples_processed` is the number of trained samples.
+
+* "caffe_poly"
+
+  Lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  Lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  Lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  Lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      Optimizer = paddle.optimizer.Adam(
+          Learning_rate=1e-3,
+          Learning_rate_schedule="manual",
+          Learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is: code:`1e-3*1.0`; when the number of trained samples is greater than 1000 or less than or equal to 2000, the learning rate is:code:`1e- 3 * 0.9`; when the number of trained samples is greater than 2,000, the learning rate is: code:`1e-3*0.8`.
+
+* "pass_manual"
+
+  This is a learning rate annealing method that piecewisely pick values according to the number of trained passes. When using this learning rate schedule, we set the learning rate attenuation factor piecewise function by the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      Optimizer = paddle.optimizer.Adam(
+          Learning_rate=1e-3,
+          Learning_rate_schedule="pass_manual",
+          Learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3*1.0`; when the number of trained passes is greater than 1 or less than 2, the learning rate is :code:`1e- 3 * 0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3*0.8`.
+
+3. How to Initialize Parameters
+-----------------
+
+By default, PaddlePaddle initializes parameters with an average of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters:
+
+* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+For example, to set a full connection layer parameter initialization mode and bias initialization mode, you can use the following code:
+
+.. code-block:: python
+
+    Hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      Bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+The above code initializes the bias to 1.0 and initializes the parameters to a uniform distribution of :code:`[1.0, -1.0]`.
+
+4. How to Share Parameters
+---------------
+
+PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name will share parameters//. We can set the name of the parameters using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the parameters to be shared use the same :code:`ParamAttr` object.
+
+A simple fully connected network has its configuration of parameter sharing as follows \:
+
+.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two input of the softmax layer also use the same parameter :code:`softmax_param`.
+
+5. How to Load Pre-training Parameters
+------------------------
+* For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    Emb_para = paddle.attr.Param(name='emb', is_static=True)
+    Paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+
+* Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle is the header information. The user must loads : :code:`numpy.array` starting with the 17th byte. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    Def load_parameter(file_name, h, w):
+        With open(file_name, 'rb') as f:
+            F.read(16) # skip header.
+            Return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    Parameters = paddle.parameters.create(my_cost)
+    Parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. Format of the Stored Parameter and How to Convert the File to Plain Text
+--------------------------------------------------
+
+The model parameter file saved by PaddlePaddle consists of 16 bytes of header information and network parameters. In the header information, the first four bytes show PaddlePaddle's version information. The user should fill in with 0s. The next four bytes represent the number of bytes occupied by each parameter. If the saved network parameter is a float type, the number is four; if it is a double, the number is eight. The third group of four bytes represents the total number of saved parameters.
+
+When restoring the model parameters saved by PaddlePaddle back to plain text, we use the corresponding data type :code:`numpy.array` to load specific network parameters. At this time, you can skip the header information of the PaddlePaddle model parameter file. If not specified to compile with a precision for double in PaddlePaddle, then the parameter file will be caiculated with a precision for float, and the argument will be stored as a float. In this case, when using :code:`numpy.array`, generally we set :code:`dtype=float32`. An example is as follows:
+
+.. code-block:: python
+
+    Def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip header
+        Vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        Np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                Fmt="%.6f", delimiter=",")
+
+
+When the plaintext parameters are converted into PaddlePaddle loadable model parameters, the header information is constructed first, then the network parameters are written. The following code converts the randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
+
+.. code-block:: python
+
+    Def gen_rand_param(param_file, width, height, need_trans):
+        Np.random.seed()
+        Header = struct.pack("iil", 0, 4, height * width)
+        Param = np.float32(np.random.rand(height, width))
+        With open(param_file, "w") as fparam:
+            Fparam.write(header + param.tostring())
+
+7. A Protocol Message Rejected Because of its Large Size
+-------------------------------------------------- ----------
+
+If you are training NLP related models, and the following error occurs:
+
+.. code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. A wrongly defineed `_py_data_sources2` is similar to:
+
+.. code-block:: python
+
+     Src_dict = dict()
+     For line_count, line in enumerate(open(src_dict_path, "r")):
+        Src_dict[line.strip()] = line_count
+
+     Define_py_data_sources2(
+        Train_list,
+        Test_list,
+        Module="dataprovider",
+        Obj="process",
+        Args={"src_dict": src_dict})
+
+The solution is to pass the address of the dictionary as args to the dataprovider, and then load the dictionary according to the address in the dataprovider. Change `_py_data_sources2` to:
+
+.. code-block:: python
+
+     Define_py_data_sources2(
+        Train_list,
+        Test_list,
+        Module="dataprovider",
+        Obj="process",
+        Args={"src_dict_path": src_dict_path})
+
+The full source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
--- a/doc/v2/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst
-Layers supporting hierarchical sequence as input
-================================================
-
-TBD
+###########################
+Layers that Support Hierarchical Sequences as Input
+###########################
+ 
+.. contents::
+ 
+Overview 
+====
+ 
+A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
+ 
+A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information.
+ 
+We can define non-sequences, single-level sequences, and double-level sequences at the following levels.
+ 
+ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
+ Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of elements is an important input information;
+ Double-level sequence: multiple elements arranged in a row; each element is a single-layer sequence called a subseq of a double-level sequence, and each element of the subseq is a 0-level sequence.
+ 
+In PaddlePaddle, the following layers accept double-layer sequences as input and perform corresponding calculations.
+ 
+`pooling`
+========
+ 
+The use of pooling is as follows:
+ 
+.. code-block:: bash
+ 
+        Seq_pool = pooling(input=layer,
+                           Pooling_type=pooling.Max(),
+                           Agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
+ 
+- When ʻagg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence 
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double)
+ 
+- When ʻagg_level=AggregateLevel.TO_SEQUENCE`:
+ 
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence.
+ 
+`last_seq` and `first_seq`
+=====================
+ 
+An example of using `last_seq` is as follows (usage of `first_seq` is similar).
+ 
+.. code-block:: bash
+ 
+        Last = last_seq(input=layer,
+                        Agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- When ʻagg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level).
+ 
+- When ʻagg_level=AggregateLevel.TO_SEQUENCE`:
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-layer sequence in which each element is the last (or first) element of each subseq in a double-level sequence.
+ 
+`expand`
+======
+ 
+The use of expand is as follows.
+ 
+.. code-block:: bash
+ 
+        Ex = expand(input=layer1,
+                    Expand_as=layer2,
+                    Expand_level=ExpandLevel.FROM_NO_SEQUENCE)
+        
+- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
+ 
+  - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence
+  - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information
+  - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element
+ 
+- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+ 
+  - Effect: a single-level sequence is extended to a double-level sequence
+  - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information
+  - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence.
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -179,26 +179,17 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
-paddle.fluid.layers.BlockGuardServ.__init__ ArgSpec(args=['self', 'server'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.__init__ ArgSpec(args=['self', 'endpoint', 'inputs', 'fan_in', 'optimizer_mode'], varargs=None, keywords=None, defaults=(1, True))
-paddle.fluid.layers.ListenAndServ.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.get_params_and_grads ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Send ArgSpec(args=['endpoints', 'send_vars', 'sync'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.Recv ArgSpec(args=['endpoints', 'get_vars', 'sync'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
-paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True))
+paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Preprocessor.is_completed ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
 paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False))
@@ -218,9 +209,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -350,6 +338,26 @@ paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps
 paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32'))
+paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
+paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
+paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
 add_subdirectory(details)
+add_subdirectory(ir)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

@@ -93,7 +94,7 @@ else()
 endif()


-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph)

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
-cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)

-cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
-cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)

@@ -35,7 +34,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS

 cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)

-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)


--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -23,10 +23,14 @@ namespace framework {
 namespace details {

 #ifdef PADDLE_WITH_CUDA
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
                                     const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      nccl_ctxs_(ctxs) {
  if (nccl_ctxs_) {
    for (auto &p : places_) {
      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
@@ -34,9 +38,10 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
  }
 }
 #else
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

 void AllReduceOpHandle::RunImpl() {

--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -30,11 +30,11 @@ namespace details {

 struct AllReduceOpHandle : public OpHandleBase {
 #ifdef PADDLE_WITH_CUDA
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *ctxs);
 #else
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places);
 #endif
  std::string Name() const override;

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -35,10 +35,13 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
 public:
 #ifdef PADDLE_WITH_CUDA
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase {
    }
  }
 #else
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

  std::string Name() const override;

--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -96,48 +96,61 @@ struct TestBroadcastOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("input");

+    std::unique_ptr<ir::Node> n(
+        new ir::Node("node0", ir::Node::Type::kOperation));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
+                                             nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
+                                             nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
+          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
 #endif
    }

-    auto* in_var_handle =
-        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+    std::unique_ptr<ir::Node> v(
+        new ir::Node("node1", ir::Node::Type::kVariable));
+    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
+                                        gpu_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
    op_handle_->AddInput(in_var_handle);

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+
+    std::unique_ptr<ir::Node> v2(
+        new ir::Node("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(v2.get()));
    DummyVarHandle* dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    dummy_var_handle->generated_op_ = nullptr;
+    dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(dummy_var_handle);

    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
-      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      std::unique_ptr<ir::Node> v3(
+          new ir::Node("node3", ir::Node::Type::kVariable));
+      VarHandle* out_var_handle =
+          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
    }

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    std::unique_ptr<ir::Node> v4(
+        new ir::Node("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(v4.get()));
    DummyVarHandle* out_dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    out_dummy_var_handle->generated_op_ = nullptr;
+    out_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddOutput(out_dummy_var_handle);
  }


--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -19,9 +19,10 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
                                         platform::Place place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(*node->Op())),
      scope_(scope),
      place_(place) {}

@@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() {

 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
  bool need_wait =
-      in_var && in_var->generated_op_ &&
-      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var && in_var->GeneratedOp() &&
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
  return need_wait;
 }


--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -28,8 +28,7 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
 public:
-  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
-                      platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);

  std::string Name() const override;


--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -22,10 +22,10 @@ namespace details {

 #ifdef PADDLE_WITH_CUDA
 DataBalanceOpHandle::DataBalanceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places) {
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
  if (ctxs) {
    for (auto &p : places_) {
      this->dev_ctxes_[p] = ctxs->DevCtx(p);
@@ -34,9 +34,9 @@ DataBalanceOpHandle::DataBalanceOpHandle(
 }
 #else
 DataBalanceOpHandle::DataBalanceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

 std::string DataBalanceOpHandle::Name() const { return "data balance"; }

--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -30,11 +30,11 @@ namespace details {
 struct DataBalanceOpHandle : public OpHandleBase {
 public:
 #ifdef PADDLE_WITH_CUDA
-  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                      const std::vector<platform::Place> &places,
                      const platform::NCCLContextMap *ctxs);
 #else
-  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                      const std::vector<platform::Place> &places);
 #endif


--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -21,13 +21,16 @@ namespace paddle {
 namespace framework {
 namespace details {

-FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
                             std::vector<Scope *> *local_scopes)
-    : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+    : OpHandleBase(node),
+      data_(data),
+      offset_(offset),
+      local_scopes_(local_scopes) {}

 FetchOpHandle::~FetchOpHandle() {
  for (auto *input_var : inputs_) {
-    input_var->pending_ops_.erase(this);
+    input_var->RemoveOutput(this, this->Node());
  }
 }

@@ -77,8 +80,8 @@ void FetchOpHandle::RunImpl() {
 void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
  auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
  for (auto *input : inputs_) {
-    if (input->generated_op_) {
-      input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
+    if (input->GeneratedOp()) {
+      input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx);
    }
  }
 }

--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -28,7 +28,7 @@ namespace details {

 struct FetchOpHandle : public OpHandleBase {
 public:
-  FetchOpHandle(FeedFetchList *data, size_t offset,
+  FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
                std::vector<Scope *> *local_scopes);

  ~FetchOpHandle();

--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -30,10 +30,12 @@ namespace details {

 struct FuseVarsOpHandle : public OpHandleBase {
 public:
-  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
+                   const platform::Place &place,
                   const std::unordered_map<std::string, int64_t> &inputs_numel,
                   const std::type_index &var_type)
-      : local_scope_(local_scope),
+      : OpHandleBase(node),
+        local_scope_(local_scope),
        place_(place),
        inputs_numel_(inputs_numel),
        type_(var_type) {

--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -20,9 +20,10 @@ namespace paddle {
 namespace framework {
 namespace details {

-GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
+GatherOpHandle::GatherOpHandle(ir::Node *node,
+                               const std::vector<Scope *> &local_scopes,
                               const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}

 void GatherOpHandle::RunImpl() {
  if (places_.size() == 1) return;

--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -30,7 +30,7 @@ namespace details {

 struct GatherOpHandle : public OpHandleBase {
 public:
-  GatherOpHandle(const std::vector<Scope *> &local_scopes,
+  GatherOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places);

  std::string Name() const override;

--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -70,6 +70,7 @@ struct TestGatherOpHandle {
  }

  void InitGatherOp(size_t input_scope_idx) {
+    std::vector<std::unique_ptr<ir::Node>> nodes;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
@@ -81,30 +82,37 @@ struct TestGatherOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("out");

-    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
+    nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation));
+    op_handle_.reset(
+        new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
    // add input
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable));
+      auto* in_var_handle =
+          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
    DummyVarHandle* in_dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    in_dummy_var_handle->generated_op_ = nullptr;
+    in_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(in_dummy_var_handle);

    // add output
-    auto* out_var_handle =
-        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable));
+    auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx,
+                                         "out", gpu_list_[input_scope_idx]);
    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
    DummyVarHandle* dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
    op_handle_->AddOutput(dummy_var_handle);

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -19,6 +19,7 @@

 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace platform {
@@ -45,13 +46,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                          const std::vector<Scope *> &local_scopes,
                          const BuildStrategy &strategy);
 #endif
-
-  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override;
  int GetVarDeviceID(const std::string &varname) const override;

 private:
-  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
-                         size_t device_id) const;
+  void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const;

 private:
  std::string loss_var_name_;
@@ -63,48 +62,46 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  platform::NCCLContextMap *nccl_ctxs_;
 #endif

-  bool IsScaleLossOp(const OpDesc &op) const;
+  bool IsScaleLossOp(ir::Node *node) const;

-  void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
-  void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateRPCOp(Graph *result, ir::Node *node) const;
+  void CreateDistTrainOp(Graph *result, ir::Node *node) const;

  /**
   * Is this operator as the end-point operator before/after send operator.
   */
-  bool IsDistTrainOp(const OpDesc &op,
-                     const std::vector<std::string> &send_vars,
+  bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
                     const std::vector<std::string> &recv_vars) const;

  std::vector<std::string> FindDistTrainSendVars(
-      const ProgramDesc &program) const;
+      const std::vector<std::unique_ptr<ir::Node>> &nodes) const;

  std::vector<std::string> FindDistTrainRecvVars(
-      const ProgramDesc &program) const;
+      const std::vector<std::unique_ptr<ir::Node>> &nodes) const;

-  void ConnectOp(SSAGraph *result, OpHandleBase *op,
+  void ConnectOp(Graph *result, OpHandleBase *op,
                 const std::string &prev_op_name) const;

-  void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
+  void CreateComputationalOps(Graph *result, ir::Node *node,
                              size_t num_places) const;

-  void CreateScaleLossGradOp(SSAGraph *result) const;
-  VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
+  void CreateScaleLossGradOp(Graph *result) const;
+  VarHandle *CreateReduceOp(Graph *result, const std::string &og,
                            int dst_dev_id) const;
-  void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
-                             int dev_id) const;
+  void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const;

  bool IsParameterGradientOnce(
      const std::string &og,
      std::unordered_set<std::string> *og_has_been_broadcast) const;

-  int GetOpDeviceID(const OpDesc &op) const;
+  int GetOpDeviceID(ir::Node *node) const;

-  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
+  void InsertAllReduceOp(Graph *result, const std::string &og) const;

-  void InsertDataBalanceOp(SSAGraph *result,
+  void InsertDataBalanceOp(Graph *result,
                           const std::vector<std::string> &datas) const;

-  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
+  void CreateBroadcastOp(Graph *result, const std::string &p_name,
                         size_t src_dev_id) const;

  bool IsSparseGradient(const std::string &og) const;

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -80,19 +80,21 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {

 void OpHandleBase::AddInput(VarHandleBase *in) {
  this->inputs_.emplace_back(in);
-  in->pending_ops_.insert(this);
+  node_->inputs.push_back(in->Node());
+  in->AddOutput(this, this->Node());
 }

 void OpHandleBase::AddOutput(VarHandleBase *out) {
  outputs_.emplace_back(out);
-  out->generated_op_ = this;
+  node_->outputs.push_back(out->Node());
+  out->AddInput(this, this->Node());
 }

 void OpHandleBase::WaitInputVarGenerated() {
  for (auto in_var : inputs_) {
    if (NeedWait(in_var)) {
      for (auto &pair : dev_ctxes_) {
-        in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
+        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
      }
    }
  }
@@ -101,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
  for (auto *in : inputs_) {
    if (NeedWait(in)) {
-      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
    }
  }
 }
@@ -117,7 +119,7 @@ size_t OpHandleBase::NoDummyInputSize() const {
 }

 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
-  return in_var && in_var->generated_op_;
+  return in_var && in_var->GeneratedOp();
 }

 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"

@@ -26,9 +27,11 @@ namespace details {

 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";

+// Wraps ir::Node and provide helper utilities.
+// It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
 public:
-  OpHandleBase() {}
+  explicit OpHandleBase(ir::Node *node) : node_(node) {}

  virtual ~OpHandleBase();

@@ -82,6 +85,8 @@ class OpHandleBase {

  size_t NoDummyInputSize() const;

+  ir::Node *Node() { return node_; }
+
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);

@@ -90,6 +95,7 @@ class OpHandleBase {

  virtual void RunImpl() = 0;

+  ir::Node *node_;
  std::vector<VarHandleBase *> inputs_;
  std::vector<VarHandleBase *> outputs_;
  std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;

--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -35,14 +35,16 @@ struct ReduceLoDTensor {
    PADDLE_ENFORCE(!src_tensors_.empty());
    auto &t0 = *src_tensors_[0];
    PADDLE_ENFORCE_NE(t0.numel(), 0);
+
    dst_tensor_.Resize(t0.dims());
    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    if (dst != t0.data<T>()) {
-      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-    }

-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+    for (size_t i = 0; i < src_tensors_.size(); ++i) {
      auto &t = *src_tensors_[i];
+      if (dst == t.data<T>()) {
+        continue;
+      }
+
      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
      PADDLE_ENFORCE_EQ(t.type(), t0.type());
      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -37,10 +37,13 @@ struct ReduceOpHandle : public OpHandleBase {

 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
-  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places,
                 const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@@ -48,9 +51,9 @@ struct ReduceOpHandle : public OpHandleBase {
    }
  }
 #else
-  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

  std::string Name() const override;

--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -84,6 +84,7 @@ struct TestReduceOpHandle {
  }

  void InitReduceOp(size_t out_scope_idx) {
+    std::vector<std::unique_ptr<ir::Node>> nodes;
    // init scope
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
@@ -96,19 +97,21 @@ struct TestReduceOpHandle {
    }
    param_scopes_[out_scope_idx]->Var("out");

+    nodes.emplace_back(new ir::Node("node"));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
+                                          gpu_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
+                                          gpu_list_, nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
+          new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
 #endif
    }

@@ -118,8 +121,10 @@ struct TestReduceOpHandle {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
-      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
-      in_var_handle->generated_op_ = nullptr;
+      nodes.emplace_back(new ir::Node("node1"));
+      auto *in_var_handle =
+          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
+      in_var_handle->ClearGeneratedOp();
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }
@@ -128,12 +133,13 @@ struct TestReduceOpHandle {
    vars_.emplace_back(new DummyVarHandle());
    DummyVarHandle *in_dummy_var_handle =
        static_cast<DummyVarHandle *>(vars_.back().get());
-    in_dummy_var_handle->generated_op_ = nullptr;
+    in_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(in_dummy_var_handle);

    // add output
-    auto *out_var_handle =
-        new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
+    nodes.emplace_back(new ir::Node("node2"));
+    auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx,
+                                         "out", gpu_list_[out_scope_idx]);
    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);


--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -18,10 +18,11 @@ namespace paddle {
 namespace framework {
 namespace details {

-RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
+RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
                         const Scope *local_scope, const std::string &name,
                         const platform::Place &place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(op_desc)),
      local_scope_(local_scope),
      name_(name),
      place_(place) {}
@@ -35,8 +36,8 @@ void RPCOpHandle::RunImpl() {
    if (in->DebugString() == "dummy") {  // HACK
      continue;
    }
-    if (in->generated_op_) {
-      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]);
+    if (in->GeneratedOp()) {
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
    }
  }
  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -28,8 +28,9 @@ namespace framework {
 namespace details {

 struct RPCOpHandle : public OpHandleBase {
-  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-              const std::string& name, const platform::Place& place);
+  RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc,
+              const Scope* local_scope, const std::string& name,
+              const platform::Place& place);

  std::string Name() const override;


--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -19,10 +19,14 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
+                                             Scope *scope,
                                             platform::Place place,
                                             platform::DeviceContext *dev_ctx)
-    : coeff_(static_cast<float>(1.0 / num_dev)), scope_(scope), place_(place) {
+    : OpHandleBase(node),
+      coeff_(static_cast<float>(1.0 / num_dev)),
+      scope_(scope),
+      place_(place) {
  dev_ctxes_[place_] = dev_ctx;
 }


--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -25,7 +25,8 @@ namespace framework {
 namespace details {

 struct ScaleLossGradOpHandle : public OpHandleBase {
-  ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
+  ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
+                        platform::Place place,
                        platform::DeviceContext *context);

  ~ScaleLossGradOpHandle() final;

--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -17,6 +17,9 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/scope.h"

--- a/paddle/fluid/framework/details/ssa_graph.cc
+++ b/paddle/fluid/framework/details/ssa_graph.cc
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/ssa_graph.h"
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,8 +17,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
-  for (auto &var_map : graph->vars_) {
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
+  for (auto &var_map : graph->Get<GraphVars>("vars")) {
    for (auto &name_pair : var_map) {
      if (name_pair.second.size() <= 1) {
        continue;
@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
      auto it_old = name_pair.second.rbegin();
      ++it_old;
      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        auto *write_op = (*it_new)->generated_op_;
-        auto &read_ops = (*it_old)->pending_ops_;
+        OpHandleBase *write_op = (*it_new)->GeneratedOp();
+        const auto &read_ops = (*it_old)->PendingOps();

        for (auto *read_op : read_ops) {
          // Manually add a dependency var from read_op to write_op;
@@ -37,10 +37,11 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
            continue;
          }

-          auto *dep_var = new DummyVarHandle();
+          auto *dep_var = new DummyVarHandle(
+              graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
          read_op->AddOutput(dep_var);
          write_op->AddInput(dep_var);
-          graph->dep_vars_.emplace(dep_var);
+          graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
        }
      }
    }
@@ -48,13 +49,20 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
 }

 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
-    SSAGraph *graph, const std::string &each_var_name,
-    const platform::Place &place, size_t place_offset) {
-  auto &var_holders = graph->vars_[place_offset];
-  auto &var_holder = var_holders[each_var_name];
+    Graph *graph, ir::Node *node, const platform::Place &place,
+    size_t place_offset) {
+  auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
+  auto &var_holder = var_holders[node->Name()];
  VarHandle *var = nullptr;
  if (var_holder.empty()) {
-    var = new VarHandle(0, place_offset, each_var_name, place);
+    if (node->Var()) {
+      var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
+                          node->Name(), place);
+    } else {
+      var = new VarHandle(
+          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
+          place_offset, node->Name(), place);
+    }
    var_holder.emplace_back(var);
  } else {
    var = var_holder.rbegin()->get();
@@ -62,24 +70,26 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
  return var;
 }

-void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
-                                     const std::string &each_var_name,
+void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+                                     ir::Node *new_node,
                                     const platform::Place &place,
                                     size_t place_offset) {
-  auto &vars = graph->vars_[place_offset][each_var_name];
+  auto &vars = graph->Get<GraphVars>("vars")[place_offset][new_node->Name()];
  size_t version = vars.size();
-  auto var = new VarHandle(version, place_offset, each_var_name, place);
+  auto var =
+      new VarHandle(new_node, version, place_offset, new_node->Name(), place);
  vars.emplace_back(var);
  op_handle->AddOutput(var);
 }

-void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
-  for (auto &op : graph->ops_) {
+void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) {
+  for (auto &op : graph->Get<GraphOps>("ops")) {
    if (!op->Outputs().empty()) {
      continue;
    }
-    auto *dummy_leaf = new DummyVarHandle();
-    graph->dep_vars_.emplace(dummy_leaf);
+    auto *dummy_leaf = new DummyVarHandle(
+        graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
+    graph->Get<GraphDepVars>("dep_vars").emplace(dummy_leaf);
    op->AddOutput(dummy_leaf);
  }
 }

--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -16,20 +16,42 @@

 #include <memory>
 #include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"

-#include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"

+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
 namespace paddle {
 namespace framework {
 namespace details {

-class SSAGraphBuilder {
+// all variable in each devices.
+// The outside vector is the device vector. Each element of this vector is a
+// map from variable name to variables. The variables, who have the same name,
+// will have a differsent version. The offset in the
+// `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
+typedef std::vector<
+    std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+    GraphVars;
+
+// aux variables to represent dependency. Useful to resolve data hazard.
+typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+
+// all operators. NOTE that even we use a vector here, the operators is
+// unordered.
+typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
+
+class SSAGraphBuilder : public ir::Pass {
 public:
  SSAGraphBuilder() {}
  virtual ~SSAGraphBuilder() {}
-  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
+
  virtual int GetVarDeviceID(const std::string &var_name) const = 0;

  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
@@ -42,20 +64,19 @@ class SSAGraphBuilder {
   *
   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
   */
-  static void PolishGraphToSupportDataHazards(SSAGraph *graph);
+  static void PolishGraphToSupportDataHazards(Graph *graph);

-  static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
-                                               const std::string &each_var_name,
+  static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node,
                                               const platform::Place &place,
                                               size_t place_offset);

  // Add an output variable (each_var_name, place, place_offset) to op_handle,
  // which belongs to graph
-  static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
-                             const std::string &each_var_name,
-                             const platform::Place &place, size_t place_offset);
+  static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+                             ir::Node *new_node, const platform::Place &place,
+                             size_t place_offset);

-  static void AddOutputToLeafOps(SSAGraph *graph);
+  static void AddOutputToLeafOps(Graph *graph);
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/details/ssa_graph.h"
-#include <string>
 #include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include <string>
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {
 namespace details {

-bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
  std::unordered_map<OpHandleBase *, size_t> pending_ops;
  std::unordered_set<VarHandleBase *> pending_vars;
  std::unordered_set<VarHandleBase *> ready_vars;
@@ -28,12 +28,12 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {

  auto insert_pending_var = [&](VarHandleBase *var) {
    pending_vars.insert(var);
-    if (var->generated_op_ == nullptr) {
+    if (var->GeneratedOp() == nullptr) {
      ready_vars.emplace(var);
    }
  };

-  for (auto &var_map : graph->vars_) {
+  for (auto &var_map : graph->Get<GraphVars>("vars")) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
        insert_pending_var(version_pair.get());
@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
    }
  }

-  for (auto &var : graph->dep_vars_) {
+  for (auto &var : graph->Get<GraphDepVars>("dep_vars")) {
    insert_pending_var(var.get());
  }

-  for (auto &op : graph->ops_) {
+  for (auto &op : graph->Get<GraphOps>("ops")) {
    if (op->Inputs().empty()) {
      ready_ops.insert(op.get());
    } else {
@@ -71,7 +71,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {

    for (auto ready_var : ready_vars) {
      pending_vars.erase(ready_var);
-      for (auto *op : ready_var->pending_ops_) {
+      for (auto *op : ready_var->PendingOps()) {
        auto &deps = --pending_ops[op];
        if (deps == 0) {
          ready_ops.insert(op);

--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -21,7 +21,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct SSAGraph;

 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
 public:
@@ -29,17 +28,17 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
      std::unique_ptr<SSAGraphBuilder>&& builder)
      : builder_(std::move(builder)) {}

-  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
-    auto graph = builder_->Build(program);
-    PADDLE_ENFORCE(IsValidGraph(graph.get()));
-    return graph;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+    auto new_graph = builder_->Apply(std::move(graph));
+    PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
+    return new_graph;
  }

  int GetVarDeviceID(const std::string& var_name) const override {
    return builder_->GetVarDeviceID(var_name);
  }

-  bool IsValidGraph(const SSAGraph* graph) const;
+  bool IsValidGraph(const Graph* graph) const;

 private:
  std::unique_ptr<SSAGraphBuilder> builder_;

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -18,8 +18,8 @@
 #include <string>
 #include <vector>

-#include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -14,15 +14,15 @@

 #include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include <string>
-#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {
 namespace details {

 template <typename Callback>
-static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
+static inline void IterAllVar(const Graph &graph, Callback callback) {
+  for (auto &each : graph.Get<GraphVars>("vars")) {
    for (auto &pair1 : each) {
      for (auto &pair2 : pair1.second) {
        callback(*pair2);
@@ -30,12 +30,12 @@ static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
    }
  }

-  for (auto &var : graph.dep_vars_) {
+  for (auto &var : graph.Get<GraphDepVars>("dep_vars")) {
    callback(*var);
  }
 }

-void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+void GraphvizSSAGraphPrinter::Print(const Graph &graph,
                                    std::ostream &sout) const {
  size_t var_id = 0;
  std::unordered_map<const VarHandleBase *, size_t> vars;
@@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
  });

  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
+  for (auto &op : graph.Get<GraphOps>("ops")) {
    std::string op_name = "op_" + std::to_string(op_id++);
    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
         << std::endl;

--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -21,16 +21,16 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct SSAGraph;
+
 class SSAGraphPrinter {
 public:
  virtual ~SSAGraphPrinter() {}
-  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+  virtual void Print(const Graph& graph, std::ostream& sout) const = 0;
 };

 class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
 public:
-  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+  void Print(const Graph& graph, std::ostream& sout) const override;
 };

 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
@@ -50,10 +50,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
        stream_ptr_(std::move(sout)),
        stream_ref_(*stream_ptr_) {}

-  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
-    auto graph = builder_->Build(program);
-    printer_->Print(*graph, stream_ref_);
-    return graph;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+    auto new_graph = builder_->Apply(std::move(graph));
+    printer_->Print(*new_graph, stream_ref_);
+    return new_graph;
  }

  int GetVarDeviceID(const std::string& var_name) const override {

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,13 +14,14 @@

 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"

+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph)
+    const std::vector<platform::Place> &places, std::unique_ptr<Graph> &&graph)
    : graph_(std::move(graph)),
      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                       : nullptr),
@@ -43,18 +44,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  std::unordered_set<OpHandleBase *> delayed_ops;

  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->vars_) {
+  for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
        InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
      }
    }
  }
-  for (auto &var : graph_->dep_vars_) {
+  for (auto &var : graph_->Get<details::GraphDepVars>("dep_vars")) {
    InsertPendingVar(&pending_vars, &ready_vars, var.get());
  }

-  for (auto &op : graph_->ops_) {
+  for (auto &op : graph_->Get<details::GraphOps>("ops")) {
    if (op->Inputs().empty()) {  // Special case, Op has no input.
      ready_ops.insert(op.get());
    } else {
@@ -64,11 +65,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(

  // Step 2. Insert FetchOps
  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  std::vector<std::unique_ptr<ir::Node>> tmp_nodes;
  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
  FeedFetchList fetch_data(fetch_tensors.size());

-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, &ready_vars, &fetch_data);
+  InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies,
+                 &pending_ops, &pending_vars, &ready_vars, &fetch_data);

  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
    for (auto *op : set) {
@@ -125,7 +127,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    // Find the ready_ops after the ready_var.
    for (auto ready_var : cur_ready_vars) {
      pending_vars.erase(ready_var);
-      for (auto *op : ready_var->pending_ops_) {
+      for (auto *op : ready_var->PendingOps()) {
        auto &deps = pending_ops[op];
        --deps;
        if (deps == 0) {
@@ -151,6 +153,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 void ThreadedSSAGraphExecutor::InsertFetchOps(
    const std::vector<std::string> &fetch_tensors,
    std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+    std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
    std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
    std::unordered_set<VarHandleBase *> *pending_vars,
@@ -158,7 +161,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;

  for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->vars_) {
+    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
@@ -168,8 +171,16 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(

  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
    auto &var_name = fetch_tensors[i];
-    auto &vars = fetched_vars.at(var_name);
-    auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_);
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
+
+    temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i,
+                                 &local_scopes_);
    fetch_ops->emplace_back(op);

    for (auto &p : places_) {
@@ -180,7 +191,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
      op->AddInput(var);
    }

-    auto *fetch_dummy = new DummyVarHandle();
+    temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get());
    op->AddOutput(fetch_dummy);
    fetch_dependencies->emplace(fetch_dummy);
    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
@@ -198,7 +210,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
    std::unordered_set<VarHandleBase *> *pending_vars,
    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
  pending_vars->insert(var);
-  if (var->generated_op_ == nullptr) {
+  if (var->GeneratedOp() == nullptr) {
    ready_vars->Push(var);
  }
 }

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -27,6 +27,7 @@
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {
@@ -39,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                           const std::vector<Scope *> &local_scopes,
                           const std::vector<platform::Place> &places,
-                           std::unique_ptr<SSAGraph> &&graph);
+                           std::unique_ptr<Graph> &&graph);

  // Run a SSAGraph by a thread pool
  // Use topological sort algorithm
@@ -52,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
             details::OpHandleBase *op);

 private:
-  std::unique_ptr<SSAGraph> graph_;
+  std::unique_ptr<Graph> graph_;
  std::unique_ptr<::ThreadPool> pool_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
@@ -71,6 +72,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  void InsertFetchOps(
      const std::vector<std::string> &fetch_tensors,
      std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+      std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
      std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
      std::unordered_set<VarHandleBase *> *pending_vars,

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -13,11 +13,14 @@
 // limitations under the License.

 #pragma once
+
+#include <algorithm>
 #include <sstream>
 #include <string>
 #include <unordered_set>
 #include <utility>

+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
@@ -25,19 +28,60 @@ namespace framework {
 namespace details {
 class OpHandleBase;

+// Wraps ir::Node and provide helper utilities.
+// It's responsible for populating necessary fields of ir::Node.
+//
 // VarHandleBase is the var node in the dependency graph.
 // A variable can only be generated by a single operator. i.e.
 // This is a single assignment graph.
 struct VarHandleBase {
+  explicit VarHandleBase(ir::Node* node) : node_(node) {}
+
  virtual ~VarHandleBase();
+
  virtual std::string DebugString() const = 0;

+  void AddInput(OpHandleBase* in, ir::Node* node) {
+    node_->inputs.clear();
+    node_->inputs.push_back(node);
+    generated_op_ = in;
+  }
+
+  void AddOutput(OpHandleBase* out, ir::Node* node) {
+    if (pending_ops_.find(out) == pending_ops_.end()) {
+      pending_ops_.insert(out);
+      node_->outputs.push_back(node);
+    }
+  }
+
+  void RemoveOutput(OpHandleBase* out, ir::Node* node) {
+    pending_ops_.erase(out);
+    node_->outputs.erase(
+        std::remove(node_->outputs.begin(), node_->outputs.end(), node),
+        node_->outputs.end());
+  }
+
+  void ClearGeneratedOp() {
+    generated_op_ = nullptr;
+    node_->inputs.clear();
+  }
+
+  OpHandleBase* GeneratedOp() { return generated_op_; }
+
+  const std::unordered_set<OpHandleBase*>& PendingOps() const {
+    return pending_ops_;
+  }
+
+  ir::Node* Node() { return node_; }
+
+ protected:
  // The operator who generate this variable. nullptr if the variable
  // is a root node.
  OpHandleBase* generated_op_{nullptr};

  // Operators which depend on this variable ready.
  std::unordered_set<OpHandleBase*> pending_ops_;
+  ir::Node* node_;
 };

 // VarHandle is actually a single version of Runtime Variable.
@@ -46,11 +90,14 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
+  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
+
  std::string DebugString() const override;

-  VarHandle(size_t version, size_t scope_index, std::string name,
-            platform::Place place)
-      : version_(version),
+  VarHandle(ir::Node* node, size_t version, size_t scope_index,
+            std::string name, platform::Place place)
+      : VarHandleBase(node),
+        version_(version),
        scope_idx_(scope_index),
        name_(std::move(name)),
        place_(std::move(place)) {}
@@ -70,6 +117,8 @@ struct VarHandle : public VarHandleBase {

 // Dummy Variable. It is used to represent dependencies between operators
 struct DummyVarHandle : public VarHandleBase {
+  explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}
+
  std::string DebugString() const override;
 };


--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
+cc_library(node SRCS node.cc DEPS proto_desc)
+cc_library(graph SRCS graph.cc DEPS node)
+cc_library(pass SRCS pass.cc DEPS graph node)
+
+cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+
+namespace paddle {
+namespace framework {
+
+// NOTE(paddle-dev): This graph contains circle.
+Graph::Graph(const ProgramDesc &program) : program_(program) {
+  VLOG(3) << "block in program:" << program_.Size();
+  std::unordered_map<std::string, VarDesc *> all_vars;
+  for (auto *var : program.Block(0).AllVars()) {
+    all_vars.emplace(var->Name(), var);
+  }
+
+  std::map<std::string, ir::Node *> var_nodes;
+  for (auto *op : program.Block(0).AllOps()) {
+    ir::Node *node = CreateOpNode(op);
+
+    for (auto &each_var_name : op->InputArgumentNames()) {
+      ir::Node *var = nullptr;
+      if (var_nodes.find(each_var_name) != var_nodes.end()) {
+        var = var_nodes.at(each_var_name);
+      } else if (all_vars.count(each_var_name) != 0) {
+        var = CreateVarNode(all_vars.at(each_var_name));
+        var_nodes[each_var_name] = var;
+      } else {
+        // TODO(paddle-dev): Seems some assumption doesn't hold?
+        VLOG(3) << op->Type()
+                << " input var not in all_var list: " << each_var_name;
+        var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
+        var_nodes[each_var_name] = var;
+      }
+      node->inputs.push_back(var);
+      var->outputs.push_back(node);
+    }
+
+    for (auto &each_var_name : op->OutputArgumentNames()) {
+      ir::Node *var = nullptr;
+      if (var_nodes.find(each_var_name) != var_nodes.end()) {
+        var = var_nodes.at(each_var_name);
+      } else {
+        var = CreateVarNode(all_vars.at(each_var_name));
+        var_nodes[each_var_name] = var;
+      }
+      node->outputs.push_back(var);
+      var->inputs.push_back(node);
+    }
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc& program);
+
+  virtual ~Graph() {
+    for (auto& attr : attrs_) {
+      attr_dels_[attr.first]();
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
+
+  template <typename AttrType>
+  AttrType& Get(const std::string& attr_name) const {
+    return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
+  }
+
+  template <typename AttrType>
+  void Set(const std::string& attr_name, AttrType* attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+
+  ir::Node* CreateVarNode(VarDesc* var_desc) {
+    nodes.emplace_back(new ir::Node(var_desc));
+    return nodes.back().get();
+  }
+
+  ir::Node* CreateOpNode(OpDesc* op_desc) {
+    nodes.emplace_back(new ir::Node(op_desc));
+    return nodes.back().get();
+  }
+
+  ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
+    nodes.emplace_back(new ir::Node(name, type));
+    return nodes.back().get();
+  }
+
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+
+ private:
+  // NOTE: program_ shouldn't be exposed to user.
+  const ProgramDesc& program_;
+  std::map<std::string, boost::any> attrs_;
+  std::map<std::string, std::function<void(void)>> attr_dels_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class NOP : public OperatorBase {
+ public:
+  NOP(const std::string &type, const VariableNameMap &inputs,
+      const VariableNameMap &outputs, const AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const Scope &scope,
+               const platform::Place &place) const override {}
+};
+
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class SumOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
+    auto &inputs = op_desc.Input("X");
+    auto default_var_type = proto::VarType::SELECTED_ROWS;
+
+    bool any_input_is_lod_tensor = std::any_of(
+        inputs.begin(), inputs.end(), [block](const std::string &name) {
+          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+        });
+    if (any_input_is_lod_tensor) {
+      default_var_type = proto::VarType::LOD_TENSOR;
+    }
+
+    auto out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetType(default_var_type);
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
+                  paddle::framework::SumOpMaker);
+
+namespace paddle {
+namespace framework {
+
+TEST(GraphTest, Basic) {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"test_a", "test_b", "test_c"});
+  op->SetOutput("Out", {"test_out"});
+
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_out");
+
+  op->InferVarType(prog.MutableBlock(0));
+
+  ASSERT_EQ(proto::VarType::SELECTED_ROWS,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR);
+  op->InferVarType(prog.MutableBlock(0));
+  ASSERT_EQ(proto::VarType::LOD_TENSOR,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  std::unique_ptr<Graph> g(new Graph(prog));
+  ASSERT_EQ(g->nodes[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a");
+  ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b");
+  ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c");
+  ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out");
+  ASSERT_EQ(g->nodes[1]->Name(), "test_a");
+  ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[2]->Name(), "test_b");
+  ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[3]->Name(), "test_c");
+  ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[4]->Name(), "test_out");
+  ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes.size(), 5);
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Node {
+ public:
+  enum class Type { kOperation, kVariable };
+  explicit Node(const std::string& name, Type type)
+      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
+
+  explicit Node(VarDesc* var_desc)
+      : name_(var_desc->Name()),
+        var_desc_(var_desc),
+        op_desc_(nullptr),
+        type_(Type::kVariable) {}
+
+  explicit Node(OpDesc* op_desc)
+      : name_(op_desc->Type()),
+        var_desc_(nullptr),
+        op_desc_(op_desc),
+        type_(Type::kOperation) {}
+
+  Type NodeType() const { return type_; }
+
+  std::string Name() const { return name_; }
+
+  VarDesc* Var() {
+    PADDLE_ENFORCE(type_ == Type::kVariable);
+    return var_desc_;
+  }
+  OpDesc* Op() {
+    PADDLE_ENFORCE(type_ == Type::kOperation);
+    return op_desc_;
+  }
+
+  std::vector<Node*> inputs;
+  std::vector<Node*> outputs;
+
+ protected:
+  const std::string name_;
+  VarDesc* var_desc_;
+  OpDesc* op_desc_;
+  Type type_;
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(Node);
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() {}
+
+  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -312,19 +312,22 @@ void WriteToRecordIO(recordio::Writer *writer,
  writer->Write(buffer.str());
 }

-std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
-  std::vector<LoDTensor> result;
-  if (scanner->HasNext()) {
-    std::istringstream sin(scanner->Next());
-    uint32_t sz;
-    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
-    result.resize(sz);
-    for (uint32_t i = 0; i < sz; ++i) {
-      DeserializeFromStream(sin, &result[i], dev_ctx);
-    }
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  if (!scanner->HasNext()) {
+    return false;
  }
-  return result;
+  std::istringstream sin(scanner->Next());
+  uint32_t sz;
+  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+  auto &result = *result_ptr;
+  result.resize(sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+    DeserializeFromStream(sin, &result[i], dev_ctx);
+  }
+
+  return true;
 }

 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(

--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer,
                            const std::vector<LoDTensor>& tensor,
                            const platform::DeviceContext& dev_ctx);

-extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
+extern bool ReadFromRecordIO(recordio::Scanner* scanner,
+                             const platform::DeviceContext& dev_ctx,
+                             std::vector<LoDTensor>* result_ptr);

 /*
 * Convert between length-based LoD and offset-based LoD.

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -301,11 +301,12 @@ static void TestRecordIO() {
  {
    std::unique_ptr<std::istream> stream_ptr(stream);
    recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(&scanner, ctx);
+    std::vector<framework::LoDTensor> tensors;
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
    assert_tensor_ok(tensors[0]);
    assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(&scanner, ctx);
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
    assert_tensor_ok(tensors[0]);
    assert_tensor_ok(tensors[1]);

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -26,6 +26,7 @@
 namespace paddle {
 namespace framework {

+#if defined(PADDLE_WITH_CUDA)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -37,11 +38,11 @@ class Vector {
  Vector() { InitEmpty(); }

  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T& value = T()) {
+  explicit Vector(size_t count, const T &value = T()) {
    InitEmpty();
    if (count != 0) {
      resize(count);
-      T* ptr = begin();
+      T *ptr = begin();
      for (size_t i = 0; i < count; ++i) {
        ptr[i] = value;
      }
@@ -59,7 +60,7 @@ class Vector {

  // implicit cast from std::vector.
  template <typename U>
-  Vector(const std::vector<U>& dat) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
    if (dat.size() == 0) {
      InitEmpty();
    } else {
@@ -68,10 +69,10 @@ class Vector {
  }

  // Copy ctor
-  Vector(const Vector<T>& other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { this->operator=(other); }

  // Copy operator
-  Vector<T>& operator=(const Vector<T>& other) {
+  Vector<T> &operator=(const Vector<T> &other) {
    if (other.size() != 0) {
      this->InitByIter(other.size(), other.begin(), other.end());
    } else {
@@ -81,7 +82,7 @@ class Vector {
  }

  // Move ctor
-  Vector(Vector<T>&& other) {
+  Vector(Vector<T> &&other) {
    this->size_ = other.size_;
    this->flag_ = other.flag_;
    if (other.cuda_vec_.memory_size()) {
@@ -93,13 +94,13 @@ class Vector {
  }

  // CPU data access method. Mutable.
-  T& operator[](size_t i) {
+  T &operator[](size_t i) {
    MutableCPU();
-    return const_cast<T*>(cpu_vec_.data<T>())[i];
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
  }

  // CPU data access method. Immutable.
-  const T& operator[](size_t i) const {
+  const T &operator[](size_t i) const {
    ImmutableCPU();
    return cpu_vec_.data<T>()[i];
  }
@@ -107,43 +108,43 @@ class Vector {
  // std::vector iterator methods. Based on CPU data access method
  size_t size() const { return size_; }

-  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }

-  T* end() {
+  T *end() {
    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
  }

-  T& front() { return *begin(); }
+  T &front() { return *begin(); }

-  T& back() {
+  T &back() {
    auto it = end();
    --it;
    return *it;
  }

-  const T* begin() const {
+  const T *begin() const {
    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
  }

-  const T* end() const {
+  const T *end() const {
    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
  }

-  const T* cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }

-  const T* cend() const { return end(); }
+  const T *cend() const { return end(); }

-  const T& back() const {
+  const T &back() const {
    auto it = end();
    --it;
    return *it;
  }

-  T* data() { return begin(); }
+  T *data() { return begin(); }

-  const T* data() const { return begin(); }
+  const T *data() const { return begin(); }

-  const T& front() const { return *begin(); }
+  const T &front() const { return *begin(); }
  // end of std::vector iterator methods

  // assign this from iterator.
@@ -169,7 +170,7 @@ class Vector {
  void Extend(It begin, It end) {
    size_t pre_size = size_;
    resize(pre_size + (end - begin));
-    T* ptr = this->begin() + pre_size;
+    T *ptr = this->begin() + pre_size;
    for (; begin < end; ++begin, ++ptr) {
      *ptr = *begin;
    }
@@ -183,9 +184,9 @@ class Vector {
      MutableCPU();
      Tensor cpu_tensor;
      platform::Place cpu = platform::CPUPlace();
-      T* ptr = cpu_tensor.mutable_data<T>(
+      T *ptr = cpu_tensor.mutable_data<T>(
          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T* old_ptr =
+      const T *old_ptr =
          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
      if (old_ptr != nullptr) {
        std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,7 +197,7 @@ class Vector {
  }

  // get cuda ptr. immutable
-  const T* CUDAData(platform::Place place) const {
+  const T *CUDAData(platform::Place place) const {
    PADDLE_ENFORCE(platform::is_gpu_place(place),
                   "CUDA Data must on CUDA place");
    ImmutableCUDA(place);
@@ -204,10 +205,10 @@ class Vector {
  }

  // get cuda ptr. mutable
-  T* CUDAMutableData(platform::Place place) {
-    const T* ptr = CUDAData(place);
+  T *CUDAMutableData(platform::Place place) {
+    const T *ptr = CUDAData(place);
    flag_ = kDirty | kDataInCUDA;
-    return const_cast<T*>(ptr);
+    return const_cast<T *>(ptr);
  }

  // clear
@@ -228,7 +229,7 @@ class Vector {
  }

  // the unify method to access CPU or CUDA data. immutable.
-  const T* Data(platform::Place place) const {
+  const T *Data(platform::Place place) const {
    if (platform::is_gpu_place(place)) {
      return CUDAData(place);
    } else {
@@ -237,7 +238,7 @@ class Vector {
  }

  // the unify method to access CPU or CUDA data. mutable.
-  T* MutableData(platform::Place place) {
+  T *MutableData(platform::Place place) {
    if (platform::is_gpu_place(place)) {
      return CUDAMutableData(place);
    } else {
@@ -253,7 +254,7 @@ class Vector {
    return result;
  }

-  bool operator==(const Vector<T>& other) const {
+  bool operator==(const Vector<T> &other) const {
    if (size() != other.size()) return false;
    auto it1 = cbegin();
    auto it2 = other.cbegin();
@@ -274,7 +275,7 @@ class Vector {
  template <typename Iter>
  void InitByIter(size_t size, Iter begin, Iter end) {
    platform::Place cpu = platform::CPUPlace();
-    T* ptr = this->cpu_vec_.template mutable_data<T>(
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
    for (size_t i = 0; i < size; ++i) {
      *ptr++ = *begin++;
@@ -368,7 +369,7 @@ class Vector {
    }
  }

-  static T& EmptyDummy() {
+  static T &EmptyDummy() {
    static T dummy = T();
    return dummy;
  }
@@ -379,5 +380,53 @@ class Vector {
  size_t size_;
 };

-}  // namespace framework
+#else  // PADDLE_WITH_CUDA
+
+template <typename T>
+class CPUVector : public std::vector<T, std::allocator<T>> {
+ public:
+  CPUVector() : std::vector<T>() {}
+  CPUVector(size_t count, const T &value = T())
+      : std::vector<T>(count, value) {}
+  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
+  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector &operator=(const CPUVector &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+  CPUVector &operator=(const std::vector<T> &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
+    std::stringstream ss;
+    for (auto v : other) {
+      os << v << " ";
+    }
+    return os;
+  }
+
+  void resize(size_t size) { this->resize(size); }
+
+  T &operator[](size_t id) { return this->at(id); }
+
+  const T &operator[](size_t id) const { return this->at(id); }
+
+  template <typename D>
+  void Extend(const D &begin, const D &end) {
+    this->reserve(this->size() + size_t(end - begin));
+    this->insert(this->end(), begin, end);
+  }
+};
+
+template <typename T>
+using Vector = CPUVector<T>;
+
+#endif  // PADDLE_WITH_CUDA
+
+};  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <tuple>
 #include <vector>

+#include "paddle/fluid/framework/ir/graph.h"
+
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -129,12 +131,11 @@ ParallelExecutor::ParallelExecutor(
    PADDLE_THROW("Not compiled with CUDA.");
 #endif
  }
-
  builder_ = builder_factory.Create();
+  std::unique_ptr<Graph> graph(new Graph(main_program));
+  graph = builder_->Apply(std::move(graph));
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places,
-      builder_->Build(main_program)));
-
+      exec_strategy, member_->local_scopes_, places, std::move(graph)));
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
      member_->places_, std::move(member_->executor_)));

--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -67,7 +67,8 @@ void ReaderBase::Start() {
  }
 }

-ReaderBase::~ReaderBase() { Shutdown(); }
+ReaderBase::~ReaderBase() {}

+DecoratedReader::~DecoratedReader() { reader_->Shutdown(); }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -25,8 +25,6 @@
 namespace paddle {
 namespace framework {

-enum ReaderStatus { kRunning, kStopped };
-
 class ReaderBase {
 public:
  virtual void ReadNext(std::vector<LoDTensor>* out);
@@ -48,6 +46,8 @@ class ReaderBase {

  virtual void StartImpl() {}

+  enum ReaderStatus { kRunning, kStopped };
+
  ReaderStatus status_{kRunning};

  mutable std::mutex mu_;
@@ -74,6 +74,8 @@ class DecoratedReader : public ReaderBase,
    reader_->InsertDecoratedReader(shared_from_this());
  }

+  ~DecoratedReader();
+
 protected:
  void ShutdownImpl() override { reader_->Shutdown(); }


--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"

 namespace paddle {
 namespace framework {
@@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
    os.write(out.data(), size);
  }
  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
+    uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
+
    auto* data_ptr = tensor.data<void>();
    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
                   "Index overflow when writing tensor");
@@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
    tensor->Resize(framework::make_ddim(dims));
    void* buf;
    auto ctx = platform::CPUDeviceContext();
+    size_t size =
+        tensor->numel() *
+        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
      Tensor cpu_tensor;
@@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
@@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), tensor->memory_size());
+      is.read(static_cast<char*>(buf), size);
    }
  }
 }

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -38,4 +38,6 @@ if(WITH_TESTING)
  # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
  add_subdirectory(tests/book)
 endif()
-add_subdirectory(api)
+if(NOT APPLE)
+  add_subdirectory(api)
+endif()
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -19,10 +19,14 @@ function (inference_analysis_test TARGET)
        set(multiValueArgs SRCS)
        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

+        set(mem_opt "")
+        if(WITH_GPU)
+            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+        endif()
        cc_test(${TARGET}
                SRCS "${analysis_test_SRCS}"
                DEPS analysis
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
    endif(WITH_TESTING)
 endfunction(inference_analysis_test)

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -22,8 +22,6 @@
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"

 namespace paddle {
-namespace inference {
-namespace analysis {

 DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
            "Enable subgraph to TensorRT engine for acceleration");
@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
              "Graphviz debuger for data flow graphs.");

+namespace inference {
+namespace analysis {
+
 class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -45,14 +45,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass_manager.h"

 namespace paddle {
-namespace inference {
-namespace analysis {

 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);

+namespace inference {
+namespace analysis {
+
 class Analyzer : public OrderedRegistry<PassManager> {
 public:
  // Register all the pass-managers.

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -13,13 +13,21 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"

 namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, main) {
+TEST_F(DFG_Tester, analysis_without_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+TEST_F(DFG_Tester, analysis_with_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
  Analyzer analyser;
  analyser.Run(&argument);
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
  return stack_.top();
 }

+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inlinks.size() == n;
+}
+
 GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
    const std::vector<Node *> &source) {
  PADDLE_ENFORCE(!source.empty(),
                 "Start points of topological sorting should not be empty!");
+  // CHECK all the inputs' in-degree is 0
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
  std::unordered_set<Node *> visited;
  std::unordered_set<Node *> to_visit{source.begin(), source.end()};

@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
  while (!to_visit.empty()) {
    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
    for (auto *p : queue) {
+      if (p->deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+        continue;
+      }
      inlink_visited.clear();

      std::copy_if(p->inlinks.begin(), p->inlinks.end(),
@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
  return sorted_[cursor_];
 }

+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {

   private:
    std::vector<Node *> sorted_;
-    int cursor_{0};
+    size_t cursor_{0};
  };

  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);

 }  // namespace analysis
 }  // namespace inference

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -22,14 +22,18 @@

 namespace paddle {
 namespace inference {
+
+DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
+
 namespace analysis {

 using framework::proto::ProgramDesc;

 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes);
+    const std::vector<std::unique_ptr<Node>> &nodes);

-bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
  PADDLE_ENFORCE(!argument->transformed_program_desc);
@@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {

 bool DataFlowGraphToFluidPass::Finalize() { return true; }

-void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
-  auto traits = GraphTraits<DataFlowGraph>(graph);
-  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
-    if (it->deleted()) continue;
+void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
+  LOG(INFO) << "graph.inputs " << graph->inputs.size();
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.deleted()) continue;

-    switch (it->type()) {
+    switch (node.type()) {
      case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << it->repr();
-        AddFluidOp(&(*it));
+        LOG(INFO) << "add function " << node.repr();
+        AddFluidOp(&node);
      } break;
      case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << it->repr() << " , "
-                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
-        AddEngineOp(&(*it));
+        LOG(INFO) << "add engine op " << node.repr() << " , "
+                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
+        AddEngineOp(&node);
      } break;
      default:
        continue;
    }
  }
+
+  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
 }

-void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
-  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
+  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
  // currently only the main block is analyzed.
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
  *op = *ori_op;  // copy the attributes, by default, these will not be changed
-                  // by analysis phrase.
+  // by analysis phrase.
  // The inputs and outputs of the existing ops are not changed by tensorrt
  // subgraph pass.
  // NOTE It might be changed by other passes in the long run.
 }

-void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
-                       const framework::proto::BlockDesc& block) {
+void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+                       const framework::proto::BlockDesc &block) {
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
-  auto* func = static_cast<FunctionBlock*>(node);
+  auto *func = static_cast<FunctionBlock *>(node);

  // collect inputs
  std::vector<std::string> io;
-  for (auto* x : func->inlinks) {
+  for (auto *x : func->inlinks) {
    io.push_back(x->name());
  }
  desc.SetInput("Xs", io);

  // collect outputs
  io.clear();
-  for (auto* x : func->outlinks) {
+  for (auto *x : func->outlinks) {
    io.push_back(x->name());
  }
  desc.SetOutput("Ys", io);
-
  desc.SetType("tensorrt_engine");
+
+  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
  // Set attrs
  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
-  SetAttr(desc.Proto(), "engine_unique_key",
-          "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
-  SetAttr(desc.Proto(), "max_workspace",
-          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
+  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  node->SetPbMsg(desc.Proto()->SerializeAsString());
 }

 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes) {
+    const std::vector<std::unique_ptr<Node>> &nodes) {
  std::vector<std::string> parameters;
-  for (const auto& node : nodes) {
+  for (const auto &node : nodes) {
    if (!node->IsValue()) continue;
    PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
    framework::proto::VarDesc var;
@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
  return parameters;
 }

-void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  // TODO(Superjomn) Here need to expose some arguments for default setting.
  PADDLE_ENFORCE(node->IsFunctionBlock());
-  auto* block_node = static_cast<FunctionBlock*>(node);
+  auto *block_node = static_cast<FunctionBlock *>(node);
  framework::proto::BlockDesc proto;
  framework::BlockDesc block_desc(nullptr, &proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  LOG(INFO) << "origin variable size: "
+            << argument_->origin_program_desc->blocks(0).vars().size();
+  LOG(INFO) << "transformed variable size: "
+            << block_desc.Proto()->vars().size();
  // copy ops.
-  for (auto* node : block_node->subgraph) {
-    auto* op = block_desc.AppendOp();
+  for (auto *node : block_node->subgraph) {
+    auto *op = block_desc.AppendOp();
    PADDLE_ENFORCE(!node->pb_msg().empty());
    op->Proto()->ParseFromString(node->pb_msg());
  }
+  *block_desc.Proto()->mutable_vars() =
+      argument_->origin_program_desc->blocks(0).vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
  op->ParseFromString(node->pb_msg());
 }
@@ -151,7 +165,7 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 public:
  using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config& config)
+  explicit DFG_DebuggerPass(const Config &config)
      : DFG_GraphvizDrawPass(config) {}

  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace

-Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_inference_analysis_graphviz_log_root,
      "data_flow_graph_to_fluid_graphviz_debugger"));

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -26,6 +26,10 @@

 namespace paddle {
 namespace inference {
+
+DECLARE_int32(tensorrt_max_batchsize);
+DECLARE_int32(tensorrt_workspace_size);
+
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
 public:

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
    no++;
  }
  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 82);
+  ASSERT_EQ(no, 83);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
  PADDLE_ENFORCE(argument);
  if (!argument->main_dfg) {
-    LOG(INFO) << "Init DFG";
    argument->main_dfg.reset(new DataFlowGraph);
  }
  desc_ = argument->origin_program_desc.get();
@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    v->SetPbMsg(var.SerializeAsString());
    var2id[var.name()] = v->id();
  }
+
  for (int i = 0; i < main_block.ops_size(); i++) {
    const auto &op = main_block.ops(i);
    auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    o->SetPbMsg(op.SerializeAsString());

    // set inputs and outputs
-    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    std::unordered_set<Node *> inlinks;
    for (int j = 0; j < op.inputs_size(); j++) {
      auto &in_var = op.inputs(j);
      for (int k = 0; k < in_var.arguments_size(); k++) {
        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
        in->outlinks.push_back(o);
        o->inlinks.push_back(in);
+        inlinks.insert(in);
      }
    }
    for (int j = 0; j < op.outputs_size(); j++) {
      auto &out_var = op.outputs(j);
      for (int k = 0; k < out_var.arguments_size(); k++) {
        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (inlinks.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] = out_alias->id();  // update a -> a0
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
        out->inlinks.push_back(o);
        o->outlinks.push_back(out);
      }

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -24,12 +24,12 @@ namespace analysis {
 TEST_F(DFG_Tester, Init) {
  FluidToDataFlowGraphPass pass;
  pass.Initialize(&argument);
-  DataFlowGraph graph;
-  pass.Run(&graph);
+  pass.Run(argument.main_dfg.get());
  // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(graph.nodes.size(), 37UL);
+  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
  pass.Finalize();
-  LOG(INFO) << '\n' << graph.DotString();
+  ASSERT_FALSE(argument.main_dfg->DotString().empty());
+  EXPECT_FALSE(argument.main_dfg->inputs.empty());
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(

 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  VLOG(4) << "debug info "
+          << graph->HumanReadableInfo(false /*show_values*/,
+                                      true /*show_functions*/);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -26,13 +26,13 @@ endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
-        set(oneValueArgs "")
+        set(oneValueArgs SRC)
        set(multiValueArgs ARGS)
        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

        set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
        cc_test(${TARGET_NAME}
-                SRCS ${TARGET_NAME}.cc
+                SRCS ${inference_test_SRC}
                DEPS "${inference_deps}"
                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
        if(inference_test_ARGS)
@@ -73,24 +73,24 @@ if(NOT APPLE)
 endif()

 cc_test(test_paddle_inference_api
-        SRCS test_api.cc
+        SRCS api_tester.cc
        DEPS paddle_inference_api)

-inference_api_test(test_api_impl
+inference_api_test(test_api_impl SRC api_impl_tester.cc
                    ARGS test_word2vec test_image_classification)

 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
        SRCS api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)

-inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
+inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()

 if (WITH_ANAKIN) # only needed in CI
    # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
    # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
-    # compile the libinference_anakin_api.a and compile with anakin.so.
+    # compile the libinference_anakin_api.a and anakin.so.
    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
    nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})

--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {

 bool PaddleInferenceAnakinPredictor::Run(
    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data) {
+    std::vector<PaddleTensor> *output_data, int batch_size) {
  for (const auto &input : inputs) {
    if (input.dtype != PaddleDType::FLOAT32) {
      LOG(ERROR) << "Only support float type inputs. " << input.name

--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
  // should be allocated first.
  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data) override;
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override;

  std::unique_ptr<PaddlePredictor> Clone() override;


--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -66,6 +66,7 @@ bool NativePaddlePredictor::Init(
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
+    PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
  } else {
    paddle::framework::InitDevices(false);
    scope_.reset(new paddle::framework::Scope());
@@ -102,13 +103,13 @@ bool NativePaddlePredictor::Init(

 NativePaddlePredictor::~NativePaddlePredictor() {
  if (sub_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
    scope_->DeleteScope(sub_scope_);
  }
 }

 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data) {
+                                std::vector<PaddleTensor> *output_data,
+                                int batch_size) {
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
  bool Init(std::shared_ptr<framework::Scope> parent_scope);

  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override;
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;

  std::unique_ptr<PaddlePredictor> Clone() override;


--- a/paddle/fluid/inference/api/test_api_impl.cc
+++ b/paddle/fluid/inference/api/test_api_impl.cc
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"

 namespace paddle {

@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
      return false;
    }

-    // Analyze inference_program
-    Argument argument;
-    argument.origin_program_desc.reset(
-        new ProgramDesc(*inference_program_->Proto()));
-    Singleton<Analyzer>::Global().Run(&argument);
-    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
-    *inference_program_->Proto() = *argument.transformed_program_desc;
+    OptimizeInferenceProgram();
    ctx_ = executor_->Prepare(*inference_program_, 0);

    VLOG(5) << "to create variables";
@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
    return true;
  }

+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override {
+    PADDLE_ENFORCE_GT(batch_size, 0,
+                      "TensorRT engine needs the argument batch_size set");
+    FLAGS_tensorrt_engine_batch_size = batch_size;
+    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
+  }
+
+  void OptimizeInferenceProgram() {
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    inference_program_.reset(
+        new framework::ProgramDesc(*argument.transformed_program_desc));
+  }
+
 private:
  TensorRTConfig config_;
 };

--- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
@@ -15,50 +15,79 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

 namespace paddle {

 DEFINE_string(dirname, "", "Directory of the inference model.");

-void Main(bool use_gpu) {
+void CompareTensorRTWithFluid(bool enable_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
+
  //# 1. Create PaddlePredictor with a config.
-  TensorRTConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto predictor =
+  NativeConfig config0;
+  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.use_gpu = true;
+  config0.fraction_of_gpu_memory = 0.3;
+  config0.device = 0;
+
+  TensorRTConfig config1;
+  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.use_gpu = true;
+  config1.fraction_of_gpu_memory = 0.3;
+  config1.device = 0;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
      CreatePaddlePredictor<TensorRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);

-  for (int batch_id = 0; batch_id < 3; batch_id++) {
+  for (int batch_id = 0; batch_id < 1; batch_id++) {
    //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
+    std::vector<int64_t> data(20);
+    for (int i = 0; i < 20; i++) data[i] = i;

-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor{
+        .name = "",
+        .shape = std::vector<int>({10, 1}),
+        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
+        .dtype = PaddleDType::INT64};

    // For simplicity, we set all the slots with the same data.
    std::vector<PaddleTensor> slots(4, tensor);

    //# 3. Run
-    std::vector<PaddleTensor> outputs;
-    CHECK(predictor->Run(slots, &outputs));
+    std::vector<PaddleTensor> outputs0;
+    std::vector<PaddleTensor> outputs1;
+    CHECK(predictor0->Run(slots, &outputs0));
+    CHECK(predictor1->Run(slots, &outputs1, 10));

    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
-    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    ASSERT_EQ(outputs0.size(), 1UL);
+    ASSERT_EQ(outputs1.size(), 1UL);
+
+    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+    EXPECT_EQ(num_elements, num_elements1);
+
+    auto *data0 = static_cast<float *>(outputs0.front().data.data());
+    auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+    ASSERT_GT(num_elements, 0UL);
+    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+      EXPECT_NEAR(data0[i], data1[i], 1e-3);
    }
  }
 }

-TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
+  CompareTensorRTWithFluid(false);
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
+  CompareTensorRTWithFluid(true);
+}

 }  // namespace paddle
--- a/paddle/fluid/inference/api/test_api.cc
+++ b/paddle/fluid/inference/api/test_api.cc
@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
    LOG(INFO) << "I get other_config " << config.other_config;
  }
  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override {
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = 0) override {
    LOG(INFO) << "Run";
    return false;
  }

--- a/paddle/fluid/inference/api/high_level_api.md
+++ b/paddle/fluid/inference/api/high_level_api.md
@@ -57,4 +57,4 @@ By specifying the engine kind and config, one can get a specific implementation.
 ## Reference

 - [paddle_inference_api.h](./paddle_inference_api.h)
- [some demos](./demo)
+- [some demos](./demo_ci)
--- a/paddle/fluid/inference/api/high_level_api_cn.md
+++ b/paddle/fluid/inference/api/high_level_api_cn.md
@@ -83,5 +83,5 @@ CHECK(predictor->Run(slots, &outputs));

 ## 详细代码参考

- [inference demos](./demo)
- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
+- [inference demos](./demo_ci)
+- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -98,7 +98,8 @@ class PaddlePredictor {
  // responsible for the output tensor's buffer, either allocated or passed from
  // outside.
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data) = 0;
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;

  // Clone a predictor that share the model weights, the Cloned predictor should
  // be thread-safe.

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -93,6 +93,10 @@ class OpConverter {
  framework::Scope* scope_{nullptr};
 };

+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
    trt_##op_type__##_converter() {                                            \
@@ -111,7 +115,3 @@ class OpConverter {
  extern int TouchConverterRegister_##op_type__();                      \
  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
      TouchConverterRegister_##op_type__();
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -28,18 +28,20 @@ namespace tensorrt {

 int TensorRTEngine::runtime_batch_ = 1;

-void TensorRTEngine::Build(const DescType& paddle_model) {
+void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
 }

 void TensorRTEngine::Execute(int batch_size) {
-  std::vector<void*> buffers;
-  for (auto& buf : buffers_) {
+  batch_size_ = batch_size;
+  std::vector<void *> buffers;
+  for (auto &buf : buffers_) {
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
    PADDLE_ENFORCE_GT(buf.max_size, 0);
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
+  PADDLE_ENFORCE_NOT_NULL(stream_);
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
  SetRuntimeBatch(batch_size);
@@ -48,7 +50,7 @@ void TensorRTEngine::Execute(int batch_size) {
 TensorRTEngine::~TensorRTEngine() {
  cudaStreamSynchronize(*stream_);
  // clean buffer
-  for (auto& buf : buffers_) {
+  for (auto &buf : buffers_) {
    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
      buf.buffer = nullptr;
@@ -73,33 +75,37 @@ void TensorRTEngine::FreezeNetwork() {

  // allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size());
-  for (auto& item : buffer_sizes_) {
+  for (auto &item : buffer_sizes_) {
+    // The output buffers are not set in the network building phrase, need to
+    // infer from the TesorRT network.
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
+      PADDLE_ENFORCE_GT(item.second, 0);
    }
-    auto& buf = buffer(item.first);
+
+    auto &buf = buffer(item.first);
+    buf.max_size = item.second * max_batch_;
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
-            << buf.buffer;
-    buf.size = item.second;
-    buf.max_size = item.second * max_batch_;
+    buf.size = 0;
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
    buf.device = DeviceType::GPU;
  }
 }

-nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dims) {
+                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);

  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
+  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
@@ -108,12 +114,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
  return input;
 }

-void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
-                                   const std::string& name) {
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
+                                   const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

-  auto* output = layer->getOutput(offset);
+  auto *output = layer->getOutput(offset);
  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
@@ -125,11 +131,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
  buffer_sizes_[name] = 0;
 }

-void TensorRTEngine::DeclareOutput(const std::string& name) {
+void TensorRTEngine::DeclareOutput(const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
@@ -139,13 +145,13 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
  buffer_sizes_[name] = 0;
 }

-void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
  return buffer(name).buffer;
 }

-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
+void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst) {
  // determine data size
-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
@@ -155,17 +161,17 @@ void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                    cudaMemcpyDeviceToDevice, *stream_),
                    0);
 }

-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
+void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst) {
  // determine data size

-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
@@ -174,13 +180,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                       cudaMemcpyDeviceToHost, *stream_));
 }

-Buffer& TensorRTEngine::buffer(const std::string& name) {
+Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -188,19 +194,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
  return buffers_[slot_offset];
 }

-void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                     size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_NOT_NULL(data);
+  PADDLE_ENFORCE_NOT_NULL(stream_);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  buf.size = size;
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
 }

-void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                     size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
+  buf.size = size;
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
@@ -208,15 +218,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                       cudaMemcpyDeviceToDevice, *stream_));
 }

-void TensorRTEngine::SetITensor(const std::string& name,
-                                nvinfer1::ITensor* tensor) {
+void TensorRTEngine::SetITensor(const std::string &name,
+                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                    name);
  itensor_map_[name] = tensor;
 }

-nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
  return itensor_map_[name];
 }

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
--- a/paddle/fluid/operators/detection/target_assign_op.h
+++ b/paddle/fluid/operators/detection/target_assign_op.h
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
--- a/paddle/fluid/framework/details/ssa_graph.h
+++ b/paddle/fluid/framework/details/ssa_graph.h
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
--- a/paddle/fluid/operators/distributed/send_recv.proto
+++ b/paddle/fluid/operators/distributed/send_recv.proto
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
--- a/paddle/fluid/operators/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_sum_op.cc
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
--- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
--- a/python/paddle/fluid/tests/demo/text_classification/train.py
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_version.py
+++ b/python/paddle/fluid/tests/unittests/test_version.py
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
--- a/python/setup.py.in
+++ b/python/setup.py.in