deal with conflict

f42ea489 · nhzlx · 940f5dbc · 1e417b93 · f42ea489 · f42ea489
194 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,12 @@ else()
    set(THIRD_PARTY_BUILD_TYPE Release)
 endif()

+if(WITH_MKL)
+  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
+  if (MKL_SPLIT_GEMM)
+    add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
+  endif()
+endif()
 set(WITH_MKLML ${WITH_MKL})
 if (NOT DEFINED WITH_MKLDNN)
    if (WITH_MKL AND AVX2_FOUND)

--- a/README.md
+++ b/README.md
@@ -18,7 +18,21 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.

-### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
+### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
+### Install Latest Stable Release:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==0.14.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==0.14.0.post85
+
+# For installation on other platforms, refer to http://paddlepaddle.org/
+```

 ## Features


--- a/benchmark/paddle/image/run.sh
+++ b/benchmark/paddle/image/run.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/paddle/image/run_mkl_infer.sh
+++ b/benchmark/paddle/image/run_mkl_infer.sh
+#!/bin/bash
+
 set -e

 function clock_to_seconds() {

--- a/benchmark/paddle/image/run_mkl_train.sh
+++ b/benchmark/paddle/image/run_mkl_train.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
+#!/bin/bash
+
 set -e

 function clock_to_seconds() {

--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/paddle/rnn/run.sh
+++ b/benchmark/paddle/rnn/run.sh
+#!/bin/bash
+
 set -e

 function train() {

--- a/benchmark/tensorflow/image/run.sh
+++ b/benchmark/tensorflow/image/run.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/benchmark/tensorflow/image/run_multi.sh
+++ b/benchmark/tensorflow/image/run_multi.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/benchmark/tensorflow/rnn/run.sh
+++ b/benchmark/tensorflow/rnn/run.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
+#!/bin/bash
+
 set -e

 function test() {

--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -4,25 +4,42 @@ set(tmp_version "HEAD")
 set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
 set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
 while ("${PADDLE_VERSION}" STREQUAL "")
+  # Check current branch name
  execute_process(
-    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+    COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_TAG_NAME
-    RESULT_VARIABLE GIT_RESULT
+    OUTPUT_VARIABLE GIT_BRANCH_NAME
+    RESULT_VARIABLE GIT_BRANCH_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if (NOT ${GIT_RESULT})
-    # Check the tag is a correct version
-    if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
-      # if no tag was found, set PADDLE_VERSION to latest
-      set(PADDLE_VERSION "latest")
-    elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
-      string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
-    else()  # otherwise, get the previous git tag name.
-      set(tmp_version "${GIT_TAG_NAME}~1")
+  if (NOT ${GIT_BRANCH_RESULT})
+    execute_process(
+      COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+      OUTPUT_VARIABLE GIT_TAG_NAME
+      RESULT_VARIABLE GIT_RESULT
+      ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if (NOT ${GIT_RESULT})
+      # Check if current branch is release branch
+      if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
+        # Check the tag is a correct version
+        if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
+          # if no tag was found, set PADDLE_VERSION to latest
+          set(PADDLE_VERSION "latest")
+        elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
+          string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
+        else()  # otherwise, get the previous git tag name.
+          set(tmp_version "${GIT_TAG_NAME}~1")
+        endif()
+      else() # otherwise, we always set PADDLE_VERSION to latest
+        set(PADDLE_VERSION "latest")
+      endif()
+    else()
+      set(PADDLE_VERSION "0.0.0")
+      message(WARNING "Cannot add paddle version from git tag")
    endif()
  else()
    set(PADDLE_VERSION "0.0.0")
-    message(WARNING "Cannot add paddle version from git tag")
+    message(WARNING "Cannot add paddle version for wrong git branch result")
  endif()
 endwhile()


--- a/doc/fluid/design/ir/draft.md
+++ b/doc/fluid/design/ir/draft.md
+## Motivation
+
+There is a ```gap``` between the ```Program``` defined by
+user and the ```Executable``` that can be scheduled
+efficiently on heterogeneous hardware, either locally
+or distributedly.
+
+Usually, the ```gap``` is bridged by
+
+* A series of transformations with a defined order.
+
+* These transformations usually involve
+```insert, delete, clustering, split, dependency analysis```.
+
+* Has a simple way to verify and debug each transformation.
+
+* Flexible to add, remove or customize transformations to fit
+the requirements of various algorithms (models) and hardware scenarios.
+
+Some other events also push us to a better unified pattern.
+
+* The deep learning framework is built around the concepts of graphs.
+To leverage tools such as compilation (e.g. TVM and nGraph) or
+cross-framework conversion (e.g. ONNX), we also need an intermediate
+representation that can be connected to the rest of the ecosystem.
+
+
+We need a unified pattern to naturally support the requirements
+described above. The pattern should fit training, inference,
+and other offline serialized model transformations.
+Drawing on LLVM and other deep learning frameworks, we draft the
+design below.
+
+
+## Design
+
+### Major Concepts
+
+#### Node
+
+```Node``` represents an operation that performs some computation or
+a variable that is input or output of operation.
+
+```Node```s are connected to other ```Node```s via inputs and outputs.
+
+Other properties (maybe device placement information) can be added
+to ```Node``` in the future if it's a
+common requirement of many other ```Pass```es. Otherwise, it should live
+in a ```Node``` wrapper class that is private to some ```Pass``` or be
+a local member of a ```Pass```.
+
+#### Graph
+
+```Graph``` contains a list of ```Node```s, which are connected to
+each other via inputs and outputs.
+
+TODO: Better definitions for the graph.
+
+```Graph``` can also contain ```Attribute```s. ```Attribute```s
+can be ``any`` thing. For example, it can be a list of "wrapper"
+nodes. The ```wrapper``` nodes compose ```Node```s and provide
+helper method for execution or transformation. ```Attribute```
+can also contain other things that describe some properties of
+the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
+across ```Pass```. However, it should be used with care.
+
+#### Pass
+
+```Pass``` represents a transformation of ```Graph```. Its input
+is a ```Graph``` and its output is also a ```Graph```. For example,
+a ```Pass``` can simply print out the ```Graph```. A ```Pass```
+can also fuse some ```Graph```'s ```Node```s.
+
+#### Optimize
+
+```Optimize``` contains a series of ```Pass``` with defined order.
+```Optimize``` transforms a ```Graph``` that only contains raw
+modeling logic to a ```Graph``` that can be run efficiently while
+maintaining the original modeling logic.
+
+
+### Optimize Process
+
+* Program is first converted to Graph.
+* Graph goes through a series of Pass
+* Graph is transformed from raw model logic to a
+form that is efficient to execute.
+
+Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -4,7 +4,6 @@ API
 ..  toctree::
    :maxdepth: 1

-    overview.rst
    model_configs.rst
    data.rst
    run_logic.rst
--- a/doc/v2/faq/parameter/index_en.rst
+++ b/doc/v2/faq/parameter/index_en.rst
-#################
-Parameter Setting
-#################
+##################
+Parameter Settings
+##################

-TBD
+.. contents::
+
+1. How to Choose the Learning Rate of SGD Algorithm
+---------------------------------------------------
+
+An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time.
+
+Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model.
+
+If the cost of the training process is significantly higher than the cost of the output, then we judge that the training does not converge. For example, if we have a three-class problem and use multi-class-cross-entropy as the cost, the ratio of 0, 1, and 2 in the data will be :code:`0.2, 0.5, 0.3`. The minimum cost thus will be :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is greater than this number after training a pass (or even before), then the training may not be converged and the learning rate should be reduced.
+
+2. How to Implement Learning Rate Annealing
+------------------------------------------------
+
+We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows:
+
+.. code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
+
+* "constant"
+  
+  Lr = learning_rate
+
+* "poly"
+
+  Lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  Variable :code:`num_samples_processed` is the number of trained samples.
+
+* "caffe_poly"
+
+  Lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  Lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  Lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  Lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is :code:`1e-3*1.0`; when the number of trained samples is greater than 1000 and less than or equal to 2000, the learning rate is :code:`1e-3*0.9`; when the number of trained samples is greater than 2000, the learning rate is :code:`1e-3*0.8`.
+
+* "pass_manual"
+
+  This is a learning rate annealing method that picks values piecewise according to the number of trained passes. When using this learning rate schedule, we set the learning rate attenuation factor piecewise function by the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
+
+  .. code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="pass_manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3*1.0`; when the number of trained passes is greater than 1 and less than or equal to 2, the learning rate is :code:`1e-3*0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3*0.8`.
+
+3. How to Initialize Parameters
+-------------------------------
+
+By default, PaddlePaddle initializes parameters with an average of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters:
+
+* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+For example, to set a full connection layer parameter initialization mode and bias initialization mode, you can use the following code:
+
+.. code-block:: python
+
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+The above code initializes the bias to 1.0 and initializes the parameters to a uniform distribution over :code:`[-1.0, 1.0]`.
+
+4. How to Share Parameters
+--------------------------
+
+PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name are shared. We can set the name of the parameters using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the parameters to be shared use the same :code:`ParamAttr` object.
+
+A simple fully connected network has its configuration of parameter sharing as follows:
+
+.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
+
+5. How to Load Pre-training Parameters
+--------------------------------------
+* For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    emb_para = paddle.attr.Param(name='emb', is_static=True)
+    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+
+* Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle is the header information. The user must load the data into :code:`numpy.array` starting from the 17th byte. Take the embedding layer as an example, the code is as follows:
+
+.. code-block:: python
+
+    def load_parameter(file_name, h, w):
+        with open(file_name, 'rb') as f:
+            f.read(16) # skip header.
+            return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    parameters = paddle.parameters.create(my_cost)
+    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. Format of the Stored Parameter and How to Convert the File to Plain Text
+---------------------------------------------------------------------------
+
+The model parameter file saved by PaddlePaddle consists of 16 bytes of header information and network parameters. In the header information, the first four bytes show PaddlePaddle's version information. The user should fill in with 0s. The next four bytes represent the number of bytes occupied by each parameter. If the saved network parameter is a float type, the number is four; if it is a double, the number is eight. The third group of four bytes represents the total number of saved parameters.
+
+When restoring the model parameters saved by PaddlePaddle back to plain text, we use the corresponding data type :code:`numpy.array` to load specific network parameters. At this time, you can skip the header information of the PaddlePaddle model parameter file. If not specified to compile with a precision for double in PaddlePaddle, then the parameter file will be calculated with a precision for float, and the argument will be stored as a float. In this case, when using :code:`numpy.array`, generally we set :code:`dtype=float32`. An example is as follows:
+
+.. code-block:: python
+
+    def read_parameter(fname, width):
+        s = open(fname).read()
+        # skip header
+        vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                fmt="%.6f", delimiter=",")
+
+
+When the plaintext parameters are converted into PaddlePaddle loadable model parameters, the header information is constructed first, then the network parameters are written. The following code converts the randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
+
+.. code-block:: python
+
+    def gen_rand_param(param_file, width, height, need_trans):
+        np.random.seed()
+        header = struct.pack("iil", 0, 4, height * width)
+        param = np.float32(np.random.rand(height, width))
+        with open(param_file, "w") as fparam:
+            fparam.write(header + param.tostring())
+
+7. A Protocol Message Rejected Because of its Large Size
+--------------------------------------------------------
+
+If you are training NLP related models, and the following error occurs:
+
+.. code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. A wrongly defined `define_py_data_sources2` is similar to:
+
+.. code-block:: python
+
+     src_dict = dict()
+     for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+The solution is to pass the path of the dictionary as args to the dataprovider, and then load the dictionary from that path in the dataprovider. Change `define_py_data_sources2` to:
+
+.. code-block:: python
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+The full source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
--- a/doc/v2/howto/rnn/hierarchical_layer_en.rst
+++ b/doc/v2/howto/rnn/hierarchical_layer_en.rst
-Layers supporting hierarchical sequence as input
-================================================
-
-TBD
+###################################################
+Layers that Support Hierarchical Sequences as Input
+###################################################
+ 
+.. contents::
+ 
+Overview 
+========
+ 
+A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
+ 
+A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information.
+ 
+We can define non-sequences, single-level sequences, and double-level sequences at the following levels.
+ 
+ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
+ Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of elements is an important input information;
+ Double-level sequence: multiple elements arranged in a row; each element is a single-level sequence called a subseq of a double-level sequence, and each element of the subseq is a 0-level sequence.
+ 
+In PaddlePaddle, the following layers accept double-level sequences as input and perform corresponding calculations.
+ 
+`pooling`
+=========
+ 
+The use of pooling is as follows:
+ 
+.. code-block:: bash
+ 
+        seq_pool = pooling(input=layer,
+                           pooling_type=pooling.Max(),
+                           agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
+ 
+- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence 
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double)
+ 
+- When `agg_level=AggregateLevel.TO_SEQUENCE`:
+ 
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence.
+ 
+`last_seq` and `first_seq`
+==========================
+ 
+An example of using `last_seq` is as follows (usage of `first_seq` is similar).
+ 
+.. code-block:: bash
+ 
+        last = last_seq(input=layer,
+                        agg_level=AggregateLevel.TO_SEQUENCE)
+        
+- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
+ 
+  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence
+  - Input: a double-level sequence or a single-level sequence
+  - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level).
+ 
+- When `agg_level=AggregateLevel.TO_SEQUENCE`:
+  - Effect: a double-level sequence will be transformed into a single-level sequence
+  - Input: a double-level sequence
+  - Output: a single-level sequence in which each element is the last (or first) element of each subseq in a double-level sequence.
+ 
+`expand`
+========
+ 
+The use of expand is as follows.
+ 
+.. code-block:: bash
+ 
+        ex = expand(input=layer1,
+                    expand_as=layer2,
+                    expand_level=ExpandLevel.FROM_NO_SEQUENCE)
+        
+- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
+ 
+  - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence
+  - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information
+  - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element
+ 
+- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
+ 
+  - Effect: a single-level sequence is extended to a double-level sequence
+  - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information
+  - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence.
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -179,26 +179,17 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
-paddle.fluid.layers.BlockGuardServ.__init__ ArgSpec(args=['self', 'server'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.__init__ ArgSpec(args=['self', 'endpoint', 'inputs', 'fan_in', 'optimizer_mode'], varargs=None, keywords=None, defaults=(1, True))
-paddle.fluid.layers.ListenAndServ.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.get_params_and_grads ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.ListenAndServ.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Send ArgSpec(args=['endpoints', 'send_vars', 'sync'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.Recv ArgSpec(args=['endpoints', 'get_vars', 'sync'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
-paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True))
+paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
-paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
 paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.Preprocessor.is_completed ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
 paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False))
@@ -218,9 +209,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
-paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
@@ -350,6 +338,26 @@ paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps
 paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32'))
+paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
+paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
+paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
 add_subdirectory(details)
+add_subdirectory(ir)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

@@ -93,7 +94,7 @@ else()
 endif()


-cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph)

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
-cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)

-cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
-cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
 cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
 cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)

@@ -35,7 +34,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS

 cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)

-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)


--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -23,10 +23,14 @@ namespace framework {
 namespace details {

 #ifdef PADDLE_WITH_CUDA
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places,
                                     const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+    : OpHandleBase(node),
+      local_scopes_(local_scopes),
+      places_(places),
+      nccl_ctxs_(ctxs) {
  if (nccl_ctxs_) {
    for (auto &p : places_) {
      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
@@ -34,9 +38,10 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
  }
 }
 #else
-AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
                                     const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

 void AllReduceOpHandle::RunImpl() {

--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -30,11 +30,11 @@ namespace details {

 struct AllReduceOpHandle : public OpHandleBase {
 #ifdef PADDLE_WITH_CUDA
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *ctxs);
 #else
-  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places);
 #endif
  std::string Name() const override;

--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -35,10 +35,13 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
 public:
 #ifdef PADDLE_WITH_CUDA
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places,
                    const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase {
    }
  }
 #else
-  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                    const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

  std::string Name() const override;

--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -96,48 +96,61 @@ struct TestBroadcastOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("input");

+    std::unique_ptr<ir::Node> n(
+        new ir::Node("node0", ir::Node::Type::kOperation));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
+                                             nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
+                                             nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
+          new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
 #endif
    }

-    auto* in_var_handle =
-        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+    std::unique_ptr<ir::Node> v(
+        new ir::Node("node1", ir::Node::Type::kVariable));
+    auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
+                                        gpu_list_[input_scope_idx]);
    vars_.emplace_back(in_var_handle);
    op_handle_->AddInput(in_var_handle);

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+
+    std::unique_ptr<ir::Node> v2(
+        new ir::Node("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(v2.get()));
    DummyVarHandle* dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    dummy_var_handle->generated_op_ = nullptr;
+    dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(dummy_var_handle);

    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
-      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      std::unique_ptr<ir::Node> v3(
+          new ir::Node("node3", ir::Node::Type::kVariable));
+      VarHandle* out_var_handle =
+          new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
      vars_.emplace_back(out_var_handle);
      op_handle_->AddOutput(out_var_handle);
    }

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    std::unique_ptr<ir::Node> v4(
+        new ir::Node("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(v4.get()));
    DummyVarHandle* out_dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    out_dummy_var_handle->generated_op_ = nullptr;
+    out_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddOutput(out_dummy_var_handle);
  }


--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -19,9 +19,10 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
                                         platform::Place place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(*node->Op())),
      scope_(scope),
      place_(place) {}

@@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() {

 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
  bool need_wait =
-      in_var && in_var->generated_op_ &&
-      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+      in_var && in_var->GeneratedOp() &&
+      in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
  return need_wait;
 }


--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -28,8 +28,7 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
 public:
-  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
-                      platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);

  std::string Name() const override;


--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -22,10 +22,10 @@ namespace details {

 #ifdef PADDLE_WITH_CUDA
 DataBalanceOpHandle::DataBalanceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    const platform::NCCLContextMap *ctxs)
-    : local_scopes_(local_scopes), places_(places) {
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
  if (ctxs) {
    for (auto &p : places_) {
      this->dev_ctxes_[p] = ctxs->DevCtx(p);
@@ -34,9 +34,9 @@ DataBalanceOpHandle::DataBalanceOpHandle(
 }
 #else
 DataBalanceOpHandle::DataBalanceOpHandle(
-    const std::vector<Scope *> &local_scopes,
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

 std::string DataBalanceOpHandle::Name() const { return "data balance"; }

--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -30,11 +30,11 @@ namespace details {
 struct DataBalanceOpHandle : public OpHandleBase {
 public:
 #ifdef PADDLE_WITH_CUDA
-  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                      const std::vector<platform::Place> &places,
                      const platform::NCCLContextMap *ctxs);
 #else
-  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                      const std::vector<platform::Place> &places);
 #endif


--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -21,13 +21,16 @@ namespace paddle {
 namespace framework {
 namespace details {

-FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
                             std::vector<Scope *> *local_scopes)
-    : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+    : OpHandleBase(node),
+      data_(data),
+      offset_(offset),
+      local_scopes_(local_scopes) {}

 FetchOpHandle::~FetchOpHandle() {
  for (auto *input_var : inputs_) {
-    input_var->pending_ops_.erase(this);
+    input_var->RemoveOutput(this, this->Node());
  }
 }

@@ -77,8 +80,8 @@ void FetchOpHandle::RunImpl() {
 void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
  auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
  for (auto *input : inputs_) {
-    if (input->generated_op_) {
-      input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
+    if (input->GeneratedOp()) {
+      input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx);
    }
  }
 }

--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -28,7 +28,7 @@ namespace details {

 struct FetchOpHandle : public OpHandleBase {
 public:
-  FetchOpHandle(FeedFetchList *data, size_t offset,
+  FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
                std::vector<Scope *> *local_scopes);

  ~FetchOpHandle();

--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -30,10 +30,12 @@ namespace details {

 struct FuseVarsOpHandle : public OpHandleBase {
 public:
-  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
+                   const platform::Place &place,
                   const std::unordered_map<std::string, int64_t> &inputs_numel,
                   const std::type_index &var_type)
-      : local_scope_(local_scope),
+      : OpHandleBase(node),
+        local_scope_(local_scope),
        place_(place),
        inputs_numel_(inputs_numel),
        type_(var_type) {

--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -20,9 +20,10 @@ namespace paddle {
 namespace framework {
 namespace details {

-GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
+GatherOpHandle::GatherOpHandle(ir::Node *node,
+                               const std::vector<Scope *> &local_scopes,
                               const std::vector<platform::Place> &places)
-    : local_scopes_(local_scopes), places_(places) {}
+    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}

 void GatherOpHandle::RunImpl() {
  if (places_.size() == 1) return;

--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -30,7 +30,7 @@ namespace details {

 struct GatherOpHandle : public OpHandleBase {
 public:
-  GatherOpHandle(const std::vector<Scope *> &local_scopes,
+  GatherOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places);

  std::string Name() const override;

--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -70,6 +70,7 @@ struct TestGatherOpHandle {
  }

  void InitGatherOp(size_t input_scope_idx) {
+    std::vector<std::unique_ptr<ir::Node>> nodes;
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
@@ -81,30 +82,37 @@ struct TestGatherOpHandle {
    }
    param_scopes_[input_scope_idx]->Var("out");

-    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
+    nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation));
+    op_handle_.reset(
+        new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
    // add input
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
-      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable));
+      auto* in_var_handle =
+          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
    DummyVarHandle* in_dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
-    in_dummy_var_handle->generated_op_ = nullptr;
+    in_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(in_dummy_var_handle);

    // add output
-    auto* out_var_handle =
-        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable));
+    auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx,
+                                         "out", gpu_list_[input_scope_idx]);
    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);

    // add dummy var
-    vars_.emplace_back(new DummyVarHandle());
+    nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable));
+    vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
    DummyVarHandle* dummy_var_handle =
        static_cast<DummyVarHandle*>(vars_.back().get());
    op_handle_->AddOutput(dummy_var_handle);

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -25,6 +25,7 @@
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"

@@ -66,31 +67,38 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
  }
 }

-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
-                                                const OpDesc &op,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
                                                size_t place_id) const {
  auto p = places_[place_id];
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>("ops").back().get();
  op_handle->SetDeviceContext(p,
                              platform::DeviceContextPool::Instance().Get(p));

-  for (auto &each_var_name : op.InputArgumentNames()) {
-    VarHandle *var =
-        CreateOrGetLatestVarHandle(result, each_var_name, p, place_id);
+  for (ir::Node *input : node->inputs) {
+    VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id);
    op_handle->AddInput(var);
  }

-  for (auto &each_var_name : op.OutputArgumentNames()) {
-    CreateOpOutput(result, op_handle, each_var_name, p, place_id);
+  for (ir::Node *output : node->outputs) {
+    ir::Node *new_node = nullptr;
+    if (output->Var()) {
+      new_node = result->CreateVarNode(output->Var());
+    } else {
+      new_node =
+          result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
+    }
+    CreateOpOutput(result, op_handle, new_node, p, place_id);
  }
 }

 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
-    const ProgramDesc &program) const {
+    const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
  std::vector<std::string> send_vars;
  // since parameters are all in block 0,
  // it's enough to only scan send ops in block 0
-  for (auto *op : program.Block(0).AllOps()) {
+  for (auto &node : nodes) {
+    if (node->NodeType() != ir::Node::Type::kOperation) continue;
+    OpDesc *op = node->Op();
    // TODO(Yancey1989): use a graceful method to find send op,
    // instead of the the hard code string
    if (op->Type() == "send") {
@@ -104,9 +112,11 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
 }

 std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
-    const ProgramDesc &program) const {
+    const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
  std::vector<std::string> recv_vars;
-  for (auto *op : program.Block(0).AllOps()) {
+  for (auto &node : nodes) {
+    if (node->NodeType() != ir::Node::Type::kOperation) continue;
+    OpDesc *op = node->Op();
    // TODO(Yancey1989): use a graceful method to find recv op,
    // instead of the hard code string
    if (op->Type() == "recv") {
@@ -120,7 +130,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
 }

 bool MultiDevSSAGraphBuilder::IsDistTrainOp(
-    const OpDesc &op, const std::vector<std::string> &send_vars,
+    ir::Node *node, const std::vector<std::string> &send_vars,
    const std::vector<std::string> &recv_vars) const {
  if (send_vars.size() == 0 || recv_vars.size() == 0) {
    return false;
@@ -143,8 +153,17 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
    return false;
  };

-  return checker(op.OutputArgumentNames(), send_vars) ||
-         checker(op.InputArgumentNames(), recv_vars);
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  return checker(output_var_names, send_vars) ||
+         checker(input_var_names, recv_vars);
 }

 size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
@@ -167,25 +186,30 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
  return dev_id;
 }

-std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
-    const ProgramDesc &program) const {
-  for (auto *var : program.Block(0).AllVars()) {
-    all_vars_.emplace(var->Name(), var);
+std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
+    std::unique_ptr<Graph> graph) const {
+  // Rebuild the graph structure.
+  auto nodes = std::move(graph->nodes);
+  graph->nodes.clear();
+
+  for (auto &node : nodes) {
+    if (node->NodeType() == ir::Node::Type::kVariable) {
+      all_vars_.emplace(node->Name(), node->Var());
+    }
  }

-  auto graph = new SSAGraph();
-  SSAGraph &result = *graph;
+  Graph &result = *graph;
  std::unordered_set<std::string> og_has_been_broadcast;

  // We cannot invoke resize. It is a bug of GCC 4.8
-  result.vars_ = std::vector<
-      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
-      places_.size());
+  result.Set("vars", new GraphVars(places_.size()));
+  result.Set("dep_vars", new GraphDepVars);
+  result.Set("ops", new GraphOps);

  // find send/recv vars so that we can place the distributed training
  // realted op in the place 0
-  auto send_vars = FindDistTrainSendVars(program);
-  auto recv_vars = FindDistTrainRecvVars(program);
+  auto send_vars = FindDistTrainSendVars(nodes);
+  auto recv_vars = FindDistTrainRecvVars(nodes);

  std::vector<std::unordered_set<std::string>> bcast_var_name_set;
  bcast_var_name_set.resize(places_.size());
@@ -193,14 +217,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
  size_t cur_device_id = 0;
  bool is_forwarding = true;

-  for (auto *op : program.Block(0).AllOps()) {
+  // NOTE: Currently, passes before SSAGraphBuilder cannot reorder
+  // forward, backward nodes. E.g. you can't append a forward node
+  // at the end of the node list.
+  // TODO(panyx0718): FIXME: Needs to sort by forward->backward order.
+  for (auto &node : nodes) {
+    if (node->NodeType() != ir::Node::Type::kOperation) continue;
    if (boost::get<int>(
-            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+            node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
        static_cast<int>(OpRole::kRPC)) {
-      CreateRPCOp(&result, *op);
-    } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, *op);
-    } else if (IsScaleLossOp(*op)) {
+      CreateRPCOp(&result, node.get());
+    } else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) {
+      CreateDistTrainOp(&result, node.get());
+    } else if (IsScaleLossOp(node.get())) {
      // user can customize loss@grad if not use_default_grad_scale_
      if (strategy_.gradient_scale_ !=
          BuildStrategy::GradientScaleStrategy::kCustomized) {
@@ -212,33 +241,35 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
      // the block.
      is_forwarding = false;
    } else {
-      int op_dev_id = GetOpDeviceID(*op);
+      int op_dev_id = GetOpDeviceID(node.get());
      if (op_dev_id != -1) {  // This op only runs on one specific device.
-        CreateComputationalOp(&result, *op, op_dev_id);
-        for (auto &var_name : op->OutputArgumentNames()) {
-          var_name_on_devices_.emplace(var_name, op_dev_id);
+        CreateComputationalOp(&result, node.get(), op_dev_id);
+        for (ir::Node *n : node->outputs) {
+          var_name_on_devices_.emplace(n->Name(), op_dev_id);
        }
      } else {
        // This op runs on all devices, and its output may have parameter's
        // gradients.
-        if (op->Type() == "read" && strategy_.enable_data_balance_) {
-          op->SetAttr("throw_eof_exp", false);
-          CreateComputationalOps(&result, *op, places_.size());
-          const auto &data_var_names = op->Output("Out");
+        if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
+          node->Op()->SetAttr("throw_eof_exp", false);
+          CreateComputationalOps(&result, node.get(), places_.size());
+          // TODO(paddle-dev): builder shouldn't depend on the out logic of
+          // a specific op.
+          const auto &data_var_names = node->Op()->Output("Out");
          InsertDataBalanceOp(&result, data_var_names);
        } else {
-          CreateComputationalOps(&result, *op, places_.size());
+          CreateComputationalOps(&result, node.get(), places_.size());
        }

        if (!is_forwarding && places_.size() > 1) {
          // Currently, we assume that once gradient is generated, it can be
          // broadcast, and each gradient is only broadcast once.
-          if (static_cast<bool>(boost::get<int>(op->GetAttr(
+          if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
                                    OpProtoAndCheckerMaker::OpRoleAttrName())) &
                                static_cast<int>(OpRole::kBackward))) {
            try {
-              auto backward_vars =
-                  boost::get<std::vector<std::string>>(op->GetNullableAttr(
+              auto backward_vars = boost::get<std::vector<std::string>>(
+                  node->Op()->GetNullableAttr(
                      OpProtoAndCheckerMaker::OpRoleVarAttrName()));

              PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
@@ -302,8 +333,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
   * Only variables should be the leaves of graph.
   */
  AddOutputToLeafOps(&result);
-
-  return std::unique_ptr<SSAGraph>(graph);
+  return graph;
 }

 bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
@@ -327,78 +357,96 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
 #endif
 }

-void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
+void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result,
                                                const std::string &p_name,
                                                size_t src_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
-  auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_);
+  auto *op_handle = new BroadcastOpHandle(
+      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_);
 #else
-  auto *op_handle = new BroadcastOpHandle(local_scopes_, places_);
+  auto *op_handle = new BroadcastOpHandle(
+      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_);
 #endif
+  result->Get<GraphOps>("ops").emplace_back(op_handle);

-  result->ops_.emplace_back(op_handle);
-  auto *in = result->vars_.at(src_dev_id).at(p_name).back().get();
+  auto *in =
+      result->Get<GraphVars>("vars").at(src_dev_id).at(p_name).back().get();
  op_handle->AddInput(in);

  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
    SetCommunicationContext(op_handle, p);
-    auto &vars = result->vars_.at(i).at(p_name);
-    auto *out_var = new VarHandle(vars.size(), i, p_name, p);
+    auto &vars = result->Get<GraphVars>("vars").at(i).at(p_name);
+    auto *out_var = new VarHandle(
+        result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(),
+        i, p_name, p);
    vars.emplace_back(out_var);
    op_handle->AddOutput(out_var);
  }
 }

-void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
-                                                    const OpDesc &op,
+void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result,
+                                                    ir::Node *node,
                                                    int dev_id) const {
-  result->ops_.emplace_back(
-      new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id]));
-  CreateOpHandleIOs(result, op, dev_id);
+  result->Get<GraphOps>("ops").emplace_back(
+      new ComputationOpHandle(result->CreateOpNode(node->Op()),
+                              local_scopes_[dev_id], places_[dev_id]));
+  CreateOpHandleIOs(result, node, dev_id);
 }

-void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
+void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result,
                                                const std::string &og) const {
 #ifdef PADDLE_WITH_CUDA
-  result->ops_.emplace_back(
-      new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+  result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
+      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_));
 #else
-  result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
+  result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
+      result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+      local_scopes_, places_));
 #endif
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>("ops").back().get();

  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
    SetCommunicationContext(op_handle, p);
-    auto &vars = result->vars_[i][og];
+    auto &vars = result->Get<GraphVars>("vars")[i][og];
    PADDLE_ENFORCE(!vars.empty());
    auto &prev_grad = vars.back();
    op_handle->AddInput(prev_grad.get());

-    auto var = new VarHandle(vars.size(), i, og, p);
+    auto var =
+        new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
+                      vars.size(), i, og, p);
    vars.emplace_back(var);
    op_handle->AddOutput(var);
  }
 }

 void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
-    SSAGraph *result, const std::vector<std::string> &datas) const {
+    Graph *result, const std::vector<std::string> &datas) const {
 #ifdef PADDLE_WITH_CUDA
-  result->ops_.emplace_back(
-      new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_));
+  result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
+      result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_));
 #else
-  result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
+  result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
+      result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
+      local_scopes_, places_));
 #endif
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>("ops").back().get();
  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
    SetCommunicationContext(op_handle, p);
    for (const std::string &d_name : datas) {
-      auto &vars = result->vars_[i][d_name];
+      auto &vars = result->Get<GraphVars>("vars")[i][d_name];
      PADDLE_ENFORCE(!vars.empty());
      op_handle->AddInput(vars.back().get());
-      auto var = new VarHandle(vars.size(), i, d_name, p);
+      auto var = new VarHandle(
+          result->CreateEmptyNode(d_name, ir::Node::Type::kVariable),
+          vars.size(), i, d_name, p);
      vars.emplace_back(var);
      op_handle->AddOutput(var);
    }
@@ -417,22 +465,22 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
  return is_pg_once;
 }

-int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
+int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
    return -1;
  }
  int op_role = boost::get<int>(
-      op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
+      node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
  if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
    return -1;
  }
  auto param_grad = boost::get<std::vector<std::string>>(
-      op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+      node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));

  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
  int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(),
-                    param_grad[0]);
+  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]",
+                    node->Op()->Type(), param_grad[0]);
  return dev_id;
 }

@@ -441,7 +489,7 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
  return got == var_name_on_devices_.end() ? -1 : got->second;
 }

-void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const {
  for (size_t i = 0; i < places_.size(); ++i) {
 // Insert ScaleCost OpHandle
 #ifdef PADDLE_WITH_CUDA
@@ -452,11 +500,11 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
    auto *communication_dev_ctx =
        platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
 #endif
-
-    auto *op_handle =
-        new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
-                                  places_[i], communication_dev_ctx);
-    result->ops_.emplace_back(op_handle);
+    auto *op_handle = new ScaleLossGradOpHandle(
+        result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
+        local_scopes_.size(), local_scopes_[i], places_[i],
+        communication_dev_ctx);
+    result->Get<GraphOps>("ops").emplace_back(op_handle);

    // FIXME: Currently ScaleLossGradOp only use device_count as scale
    // factor. So it does not depend on any other operators.
@@ -464,43 +512,51 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
    // loss->pending_ops_.emplace_back(op_handle);
    // op_handle->inputs_.emplace_back(loss);

-    CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
-                   i);
+    CreateOpOutput(result, op_handle,
+                   result->CreateEmptyNode(GradVarName(loss_var_name_),
+                                           ir::Node::Type::kVariable),
+                   places_[i], i);
  }
 }

-void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
-                                                     const OpDesc &op,
+void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result,
+                                                     ir::Node *node,
                                                     size_t num_places) const {
  for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
    auto p = places_[scope_idx];
    auto s = local_scopes_[scope_idx];
-    result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
-    CreateOpHandleIOs(result, op, scope_idx);
+    result->Get<GraphOps>("ops").emplace_back(
+        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+    CreateOpHandleIOs(result, node, scope_idx);
  }
 }

-VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
+VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result,
                                                   const std::string &og,
                                                   int dst_dev_id) const {
 #ifdef PADDLE_WITH_CUDA
-  result->ops_.emplace_back(
-      new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+  result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
+      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
+      local_scopes_, places_, nccl_ctxs_));
 #else
-  result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_));
+  result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
+      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
+      local_scopes_, places_));
 #endif
-  auto *op_handle = result->ops_.back().get();
+  auto *op_handle = result->Get<GraphOps>("ops").back().get();

  for (size_t i = 0; i < places_.size(); ++i) {
    auto &p = places_[i];
    SetCommunicationContext(op_handle, p);
-    auto &vars = result->vars_[i][og];
+    auto &vars = result->Get<GraphVars>("vars")[i][og];
    PADDLE_ENFORCE(!vars.empty());
    auto &prev_grad = vars.back();
    op_handle->AddInput(prev_grad.get());
  }
-  auto &vars = result->vars_[dst_dev_id][og];
-  auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
+  auto &vars = result->Get<GraphVars>("vars")[dst_dev_id][og];
+  auto var =
+      new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
+                    vars.size(), dst_dev_id, og, places_[dst_dev_id]);
  vars.emplace_back(var);
  op_handle->AddOutput(var);
  return var;
@@ -508,35 +564,46 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,

 // Find the first occurence of `prev_op_name` and make current `op` depend
 // on it.
-void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
+void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op,
                                        const std::string &prev_op_name) const {
-  for (auto &prev_op : result->ops_) {
+  for (auto &prev_op : result->Get<GraphOps>("ops")) {
    if (prev_op->Name() == prev_op_name) {
-      auto *dep_var = new DummyVarHandle();
+      auto *dep_var = new DummyVarHandle(
+          result->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
      prev_op->AddOutput(dep_var);
-      result->dep_vars_.emplace(dep_var);
+      result->Get<GraphDepVars>("dep_vars").emplace(dep_var);
      op->AddInput(dep_var);
    }
  }
 }

-void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
-                                                const OpDesc &op) const {
+void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
+                                                ir::Node *node) const {
  int op_dev_id = -1;
-  if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
-    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  if (node->Op()->Type() == "split_byref" ||
+      node->Op()->Type() == "split_selected_rows") {
+    op_dev_id = GetVarDeviceID(input_var_names[0]);
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
-      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
-      for (auto &varname : op.InputArgumentNames()) {
+      op_dev_id = GetAppropriateDeviceID(input_var_names);
+      for (auto &varname : input_var_names) {
        var_name_on_devices_.emplace(varname, op_dev_id);
      }
    }
-    for (auto &varname : op.OutputArgumentNames()) {
+    for (auto &varname : output_var_names) {
      var_name_on_devices_.emplace(varname, op_dev_id);
    }
-  } else if (op.Type() == "concat") {
-    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
-    for (auto &varname : op.OutputArgumentNames()) {
+  } else if (node->Op()->Type() == "concat") {
+    op_dev_id = GetVarDeviceID(input_var_names[0]);
+    for (auto &varname : output_var_names) {
      var_name_on_devices_.emplace(varname, op_dev_id);
    }
  } else {
@@ -546,34 +613,43 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
  }

  PADDLE_ENFORCE(op_dev_id != -1,
-                 "can not find right place for distributed op: %s", op.Type());
+                 "can not find right place for distributed op: %s",
+                 node->Op()->Type());

-  CreateComputationalOp(result, op, op_dev_id);
-  if (op.Type() == "concat") {
-    ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
+  CreateComputationalOp(result, node, op_dev_id);
+  if (node->Op()->Type() == "concat") {
+    ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
+              "fetch_barrier");
  }
 }

 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
-                                          const OpDesc &op) const {
+void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const {
  int op_dev_id = -1;
-  if (op.Type() == "send") {
-    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+  if (node->Op()->Type() == "send") {
+    op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
    // the variable name which contains .block means it was splited by
    // split_byref op
    // so that we can balance the variable blocks to all the pserver
    // instances.
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
-        op.InputArgumentNames()[0].find(".block") == std::string::npos) {
-      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
-      for (auto &varname : op.InputArgumentNames()) {
+        node->inputs[0]->Name().find(".block") == std::string::npos) {
+      std::vector<std::string> input_var_names;
+      for (ir::Node *n : node->inputs) {
+        input_var_names.push_back(n->Name());
+      }
+      op_dev_id = GetAppropriateDeviceID(input_var_names);
+      for (auto &varname : input_var_names) {
        var_name_on_devices_.emplace(varname, op_dev_id);
      }
    }
-  } else if (op.Type() == "recv") {
-    op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
-    for (auto &varname : op.OutputArgumentNames()) {
+  } else if (node->Op()->Type() == "recv") {
+    std::vector<std::string> output_var_names;
+    for (ir::Node *n : node->outputs) {
+      output_var_names.push_back(n->Name());
+    }
+    op_dev_id = GetAppropriateDeviceID(output_var_names);
+    for (auto &varname : output_var_names) {
      var_name_on_devices_.emplace(varname, op_dev_id);
    }
  } else {
@@ -582,18 +658,20 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
  }

  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
-                 op.Type());
-
-  result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
-                                            op.Type(), places_[op_dev_id]));
-
-  if (op.Type() == "send_barrier") {
-    ConnectOp(result, result->ops_.back().get(), "send");
-  } else if (op.Type() == "recv") {
-    ConnectOp(result, result->ops_.back().get(), "send_barrier");
-  } else if (op.Type() == "fetch_barrier") {
-    ConnectOp(result, result->ops_.back().get(), "recv");
-  } else if (op.Type() == "send") {
+                 node->Op()->Type());
+
+  result->Get<GraphOps>("ops").emplace_back(new RPCOpHandle(
+      result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
+      node->Op()->Type(), places_[op_dev_id]));
+
+  if (node->Op()->Type() == "send_barrier") {
+    ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "send");
+  } else if (node->Op()->Type() == "recv") {
+    ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
+              "send_barrier");
+  } else if (node->Op()->Type() == "fetch_barrier") {
+    ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "recv");
+  } else if (node->Op()->Type() == "send") {
    // do nothing
  } else {
    PADDLE_THROW(
@@ -601,12 +679,12 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
        "send, send_barrier. recv, fetch_barrier]");
  }

-  CreateOpHandleIOs(result, op, op_dev_id);
+  CreateOpHandleIOs(result, node, op_dev_id);
 }

-bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
  return boost::get<int>(
-             op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
             (static_cast<int>(OpRole::kBackward) |
              static_cast<int>(OpRole::kLoss)) &&
         !loss_var_name_.empty();  // If loss_var is empty. This is test mode

--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -19,6 +19,7 @@

 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace platform {
@@ -45,13 +46,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                          const std::vector<Scope *> &local_scopes,
                          const BuildStrategy &strategy);
 #endif
-
-  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override;
  int GetVarDeviceID(const std::string &varname) const override;

 private:
-  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
-                         size_t device_id) const;
+  void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const;

 private:
  std::string loss_var_name_;
@@ -63,48 +62,46 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
  platform::NCCLContextMap *nccl_ctxs_;
 #endif

-  bool IsScaleLossOp(const OpDesc &op) const;
+  bool IsScaleLossOp(ir::Node *node) const;

-  void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
-  void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateRPCOp(Graph *result, ir::Node *node) const;
+  void CreateDistTrainOp(Graph *result, ir::Node *node) const;

  /**
   * Is this operator as the end-point operator before/after send operator.
   */
-  bool IsDistTrainOp(const OpDesc &op,
-                     const std::vector<std::string> &send_vars,
+  bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
                     const std::vector<std::string> &recv_vars) const;

  std::vector<std::string> FindDistTrainSendVars(
-      const ProgramDesc &program) const;
+      const std::vector<std::unique_ptr<ir::Node>> &nodes) const;

  std::vector<std::string> FindDistTrainRecvVars(
-      const ProgramDesc &program) const;
+      const std::vector<std::unique_ptr<ir::Node>> &nodes) const;

-  void ConnectOp(SSAGraph *result, OpHandleBase *op,
+  void ConnectOp(Graph *result, OpHandleBase *op,
                 const std::string &prev_op_name) const;

-  void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
+  void CreateComputationalOps(Graph *result, ir::Node *node,
                              size_t num_places) const;

-  void CreateScaleLossGradOp(SSAGraph *result) const;
-  VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
+  void CreateScaleLossGradOp(Graph *result) const;
+  VarHandle *CreateReduceOp(Graph *result, const std::string &og,
                            int dst_dev_id) const;
-  void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
-                             int dev_id) const;
+  void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const;

  bool IsParameterGradientOnce(
      const std::string &og,
      std::unordered_set<std::string> *og_has_been_broadcast) const;

-  int GetOpDeviceID(const OpDesc &op) const;
+  int GetOpDeviceID(ir::Node *node) const;

-  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
+  void InsertAllReduceOp(Graph *result, const std::string &og) const;

-  void InsertDataBalanceOp(SSAGraph *result,
+  void InsertDataBalanceOp(Graph *result,
                           const std::vector<std::string> &datas) const;

-  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
+  void CreateBroadcastOp(Graph *result, const std::string &p_name,
                         size_t src_dev_id) const;

  bool IsSparseGradient(const std::string &og) const;

--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -80,19 +80,21 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {

 void OpHandleBase::AddInput(VarHandleBase *in) {
  this->inputs_.emplace_back(in);
-  in->pending_ops_.insert(this);
+  node_->inputs.push_back(in->Node());
+  in->AddOutput(this, this->Node());
 }

 void OpHandleBase::AddOutput(VarHandleBase *out) {
  outputs_.emplace_back(out);
-  out->generated_op_ = this;
+  node_->outputs.push_back(out->Node());
+  out->AddInput(this, this->Node());
 }

 void OpHandleBase::WaitInputVarGenerated() {
  for (auto in_var : inputs_) {
    if (NeedWait(in_var)) {
      for (auto &pair : dev_ctxes_) {
-        in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
+        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
      }
    }
  }
@@ -101,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
  for (auto *in : inputs_) {
    if (NeedWait(in)) {
-      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
    }
  }
 }
@@ -117,7 +119,7 @@ size_t OpHandleBase::NoDummyInputSize() const {
 }

 bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
-  return in_var && in_var->generated_op_;
+  return in_var && in_var->GeneratedOp();
 }

 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"

@@ -26,9 +27,11 @@ namespace details {

 constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";

+// Wraps ir::Node and provides helper utilities.
+// It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
 public:
-  OpHandleBase() {}
+  explicit OpHandleBase(ir::Node *node) : node_(node) {}

  virtual ~OpHandleBase();

@@ -82,6 +85,8 @@ class OpHandleBase {

  size_t NoDummyInputSize() const;

+  ir::Node *Node() { return node_; }
+
 protected:
  void RunAndRecordEvent(const std::function<void()> &callback);

@@ -90,6 +95,7 @@ class OpHandleBase {

  virtual void RunImpl() = 0;

+  ir::Node *node_;
  std::vector<VarHandleBase *> inputs_;
  std::vector<VarHandleBase *> outputs_;
  std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;

--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -35,14 +35,16 @@ struct ReduceLoDTensor {
    PADDLE_ENFORCE(!src_tensors_.empty());
    auto &t0 = *src_tensors_[0];
    PADDLE_ENFORCE_NE(t0.numel(), 0);
+
    dst_tensor_.Resize(t0.dims());
    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
-    if (dst != t0.data<T>()) {
-      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
-    }

-    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+    for (size_t i = 0; i < src_tensors_.size(); ++i) {
      auto &t = *src_tensors_[i];
+      if (dst == t.data<T>()) {
+        continue;
+      }
+
      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
      PADDLE_ENFORCE_EQ(t.type(), t0.type());
      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -37,10 +37,13 @@ struct ReduceOpHandle : public OpHandleBase {

 #ifdef PADDLE_WITH_CUDA
  const platform::NCCLContextMap *nccl_ctxs_;
-  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places,
                 const platform::NCCLContextMap *nccl_ctxs)
-      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        nccl_ctxs_(nccl_ctxs) {
    if (nccl_ctxs_) {
      for (auto &p_ctx : nccl_ctxs_->contexts_) {
        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
@@ -48,9 +51,9 @@ struct ReduceOpHandle : public OpHandleBase {
    }
  }
 #else
-  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                 const std::vector<platform::Place> &places)
-      : local_scopes_(local_scopes), places_(places) {}
+      : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
 #endif

  std::string Name() const override;

--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -84,6 +84,7 @@ struct TestReduceOpHandle {
  }

  void InitReduceOp(size_t out_scope_idx) {
+    std::vector<std::unique_ptr<ir::Node>> nodes;
    // init scope
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
@@ -96,19 +97,21 @@ struct TestReduceOpHandle {
    }
    param_scopes_[out_scope_idx]->Var("out");

+    nodes.emplace_back(new ir::Node("node"));
    if (use_gpu_) {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
+                                          gpu_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW("CUDA is not support.");
 #endif
    } else {
 #ifdef PADDLE_WITH_CUDA
-      op_handle_.reset(
-          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
+                                          gpu_list_, nccl_ctxs_.get()));
 #else
-      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+      op_handle_.reset(
+          new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
 #endif
    }

@@ -118,8 +121,10 @@ struct TestReduceOpHandle {
      if (!use_gpu_) {
        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
      }
-      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
-      in_var_handle->generated_op_ = nullptr;
+      nodes.emplace_back(new ir::Node("node1"));
+      auto *in_var_handle =
+          new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
+      in_var_handle->ClearGeneratedOp();
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);
    }
@@ -128,12 +133,13 @@ struct TestReduceOpHandle {
    vars_.emplace_back(new DummyVarHandle());
    DummyVarHandle *in_dummy_var_handle =
        static_cast<DummyVarHandle *>(vars_.back().get());
-    in_dummy_var_handle->generated_op_ = nullptr;
+    in_dummy_var_handle->ClearGeneratedOp();
    op_handle_->AddInput(in_dummy_var_handle);

    // add output
-    auto *out_var_handle =
-        new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
+    nodes.emplace_back(new ir::Node("node2"));
+    auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx,
+                                         "out", gpu_list_[out_scope_idx]);
    vars_.emplace_back(out_var_handle);
    op_handle_->AddOutput(out_var_handle);


--- a/paddle/fluid/framework/details/rpc_op_handle.cc
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -18,10 +18,11 @@ namespace paddle {
 namespace framework {
 namespace details {

-RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
+RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
                         const Scope *local_scope, const std::string &name,
                         const platform::Place &place)
-    : op_(framework::OpRegistry::CreateOp(op_desc)),
+    : OpHandleBase(node),
+      op_(framework::OpRegistry::CreateOp(op_desc)),
      local_scope_(local_scope),
      name_(name),
      place_(place) {}
@@ -35,8 +36,8 @@ void RPCOpHandle::RunImpl() {
    if (in->DebugString() == "dummy") {  // HACK
      continue;
    }
-    if (in->generated_op_) {
-      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]);
+    if (in->GeneratedOp()) {
+      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
    }
  }
  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();

--- a/paddle/fluid/framework/details/rpc_op_handle.h
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -28,8 +28,9 @@ namespace framework {
 namespace details {

 struct RPCOpHandle : public OpHandleBase {
-  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
-              const std::string& name, const platform::Place& place);
+  RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc,
+              const Scope* local_scope, const std::string& name,
+              const platform::Place& place);

  std::string Name() const override;


--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -19,10 +19,14 @@
 namespace paddle {
 namespace framework {
 namespace details {
-ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
+                                             Scope *scope,
                                             platform::Place place,
                                             platform::DeviceContext *dev_ctx)
-    : coeff_(static_cast<float>(1.0 / num_dev)), scope_(scope), place_(place) {
+    : OpHandleBase(node),
+      coeff_(static_cast<float>(1.0 / num_dev)),
+      scope_(scope),
+      place_(place) {
  dev_ctxes_[place_] = dev_ctx;
 }


--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -25,7 +25,8 @@ namespace framework {
 namespace details {

 struct ScaleLossGradOpHandle : public OpHandleBase {
-  ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
+  ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
+                        platform::Place place,
                        platform::DeviceContext *context);

  ~ScaleLossGradOpHandle() final;

--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -17,6 +17,9 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/scope.h"

--- a/paddle/fluid/framework/details/ssa_graph.cc
+++ b/paddle/fluid/framework/details/ssa_graph.cc
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/ssa_graph.h"
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -17,8 +17,8 @@
 namespace paddle {
 namespace framework {
 namespace details {
-void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
-  for (auto &var_map : graph->vars_) {
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
+  for (auto &var_map : graph->Get<GraphVars>("vars")) {
    for (auto &name_pair : var_map) {
      if (name_pair.second.size() <= 1) {
        continue;
@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
      auto it_old = name_pair.second.rbegin();
      ++it_old;
      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
-        auto *write_op = (*it_new)->generated_op_;
-        auto &read_ops = (*it_old)->pending_ops_;
+        OpHandleBase *write_op = (*it_new)->GeneratedOp();
+        const auto &read_ops = (*it_old)->PendingOps();

        for (auto *read_op : read_ops) {
          // Manually add a dependency var from read_op to write_op;
@@ -37,10 +37,11 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
            continue;
          }

-          auto *dep_var = new DummyVarHandle();
+          auto *dep_var = new DummyVarHandle(
+              graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
          read_op->AddOutput(dep_var);
          write_op->AddInput(dep_var);
-          graph->dep_vars_.emplace(dep_var);
+          graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
        }
      }
    }
@@ -48,13 +49,20 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
 }

 VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
-    SSAGraph *graph, const std::string &each_var_name,
-    const platform::Place &place, size_t place_offset) {
-  auto &var_holders = graph->vars_[place_offset];
-  auto &var_holder = var_holders[each_var_name];
+    Graph *graph, ir::Node *node, const platform::Place &place,
+    size_t place_offset) {
+  auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
+  auto &var_holder = var_holders[node->Name()];
  VarHandle *var = nullptr;
  if (var_holder.empty()) {
-    var = new VarHandle(0, place_offset, each_var_name, place);
+    if (node->Var()) {
+      var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
+                          node->Name(), place);
+    } else {
+      var = new VarHandle(
+          graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
+          place_offset, node->Name(), place);
+    }
    var_holder.emplace_back(var);
  } else {
    var = var_holder.rbegin()->get();
@@ -62,24 +70,26 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
  return var;
 }

-void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
-                                     const std::string &each_var_name,
+void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+                                     ir::Node *new_node,
                                     const platform::Place &place,
                                     size_t place_offset) {
-  auto &vars = graph->vars_[place_offset][each_var_name];
+  auto &vars = graph->Get<GraphVars>("vars")[place_offset][new_node->Name()];
  size_t version = vars.size();
-  auto var = new VarHandle(version, place_offset, each_var_name, place);
+  auto var =
+      new VarHandle(new_node, version, place_offset, new_node->Name(), place);
  vars.emplace_back(var);
  op_handle->AddOutput(var);
 }

-void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
-  for (auto &op : graph->ops_) {
+void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) {
+  for (auto &op : graph->Get<GraphOps>("ops")) {
    if (!op->Outputs().empty()) {
      continue;
    }
-    auto *dummy_leaf = new DummyVarHandle();
-    graph->dep_vars_.emplace(dummy_leaf);
+    auto *dummy_leaf = new DummyVarHandle(
+        graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
+    graph->Get<GraphDepVars>("dep_vars").emplace(dummy_leaf);
    op->AddOutput(dummy_leaf);
  }
 }

--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -16,20 +16,42 @@

 #include <memory>
 #include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"

-#include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/platform/place.h"

+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
 namespace paddle {
 namespace framework {
 namespace details {

-class SSAGraphBuilder {
+// All variables on each device.
+// The outer vector is indexed by device. Each element of this vector is a
+// map from variable name to variables. Variables that share the same name
+// have different versions. The offset in the
+// `std::vector<std::unique_ptr<VarHandle>>` is the version of the variable.
+typedef std::vector<
+    std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+    GraphVars;
+
+// aux variables to represent dependency. Useful to resolve data hazard.
+typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
+
+// All operators. NOTE that even though we use a vector here, the operators
+// are unordered.
+typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
+
+class SSAGraphBuilder : public ir::Pass {
 public:
  SSAGraphBuilder() {}
  virtual ~SSAGraphBuilder() {}
-  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
+
  virtual int GetVarDeviceID(const std::string &var_name) const = 0;

  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
@@ -42,20 +64,19 @@ class SSAGraphBuilder {
   *
   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
   */
-  static void PolishGraphToSupportDataHazards(SSAGraph *graph);
+  static void PolishGraphToSupportDataHazards(Graph *graph);

-  static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
-                                               const std::string &each_var_name,
+  static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node,
                                               const platform::Place &place,
                                               size_t place_offset);

  // Add an output variable (each_var_name, place, place_offset) to op_handle,
  // which belongs to graph
-  static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
-                             const std::string &each_var_name,
-                             const platform::Place &place, size_t place_offset);
+  static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
+                             ir::Node *new_node, const platform::Place &place,
+                             size_t place_offset);

-  static void AddOutputToLeafOps(SSAGraph *graph);
+  static void AddOutputToLeafOps(Graph *graph);
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/ssa_graph_checker.cc
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/details/ssa_graph.h"
-#include <string>
 #include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include <string>
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {
 namespace details {

-bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
+bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
  std::unordered_map<OpHandleBase *, size_t> pending_ops;
  std::unordered_set<VarHandleBase *> pending_vars;
  std::unordered_set<VarHandleBase *> ready_vars;
@@ -28,12 +28,12 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {

  auto insert_pending_var = [&](VarHandleBase *var) {
    pending_vars.insert(var);
-    if (var->generated_op_ == nullptr) {
+    if (var->GeneratedOp() == nullptr) {
      ready_vars.emplace(var);
    }
  };

-  for (auto &var_map : graph->vars_) {
+  for (auto &var_map : graph->Get<GraphVars>("vars")) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
        insert_pending_var(version_pair.get());
@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
    }
  }

-  for (auto &var : graph->dep_vars_) {
+  for (auto &var : graph->Get<GraphDepVars>("dep_vars")) {
    insert_pending_var(var.get());
  }

-  for (auto &op : graph->ops_) {
+  for (auto &op : graph->Get<GraphOps>("ops")) {
    if (op->Inputs().empty()) {
      ready_ops.insert(op.get());
    } else {
@@ -71,7 +71,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {

    for (auto ready_var : ready_vars) {
      pending_vars.erase(ready_var);
-      for (auto *op : ready_var->pending_ops_) {
+      for (auto *op : ready_var->PendingOps()) {
        auto &deps = --pending_ops[op];
        if (deps == 0) {
          ready_ops.insert(op);

--- a/paddle/fluid/framework/details/ssa_graph_checker.h
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -21,7 +21,6 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct SSAGraph;

 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
 public:
@@ -29,17 +28,17 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
      std::unique_ptr<SSAGraphBuilder>&& builder)
      : builder_(std::move(builder)) {}

-  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
-    auto graph = builder_->Build(program);
-    PADDLE_ENFORCE(IsValidGraph(graph.get()));
-    return graph;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+    auto new_graph = builder_->Apply(std::move(graph));
+    PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
+    return new_graph;
  }

  int GetVarDeviceID(const std::string& var_name) const override {
    return builder_->GetVarDeviceID(var_name);
  }

-  bool IsValidGraph(const SSAGraph* graph) const;
+  bool IsValidGraph(const Graph* graph) const;

 private:
  std::unique_ptr<SSAGraphBuilder> builder_;

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -18,8 +18,8 @@
 #include <string>
 #include <vector>

-#include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {

--- a/paddle/fluid/framework/details/ssa_graph_printer.cc
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -14,15 +14,15 @@

 #include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include <string>
-#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {
 namespace details {

 template <typename Callback>
-static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
-  for (auto &each : graph.vars_) {
+static inline void IterAllVar(const Graph &graph, Callback callback) {
+  for (auto &each : graph.Get<GraphVars>("vars")) {
    for (auto &pair1 : each) {
      for (auto &pair2 : pair1.second) {
        callback(*pair2);
@@ -30,12 +30,12 @@ static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
    }
  }

-  for (auto &var : graph.dep_vars_) {
+  for (auto &var : graph.Get<GraphDepVars>("dep_vars")) {
    callback(*var);
  }
 }

-void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+void GraphvizSSAGraphPrinter::Print(const Graph &graph,
                                    std::ostream &sout) const {
  size_t var_id = 0;
  std::unordered_map<const VarHandleBase *, size_t> vars;
@@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
  });

  size_t op_id = 0;
-  for (auto &op : graph.ops_) {
+  for (auto &op : graph.Get<GraphOps>("ops")) {
    std::string op_name = "op_" + std::to_string(op_id++);
    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
         << std::endl;

--- a/paddle/fluid/framework/details/ssa_graph_printer.h
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -21,16 +21,16 @@
 namespace paddle {
 namespace framework {
 namespace details {
-struct SSAGraph;
+
 class SSAGraphPrinter {
 public:
  virtual ~SSAGraphPrinter() {}
-  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+  virtual void Print(const Graph& graph, std::ostream& sout) const = 0;
 };

 class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
 public:
-  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+  void Print(const Graph& graph, std::ostream& sout) const override;
 };

 class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
@@ -50,10 +50,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
        stream_ptr_(std::move(sout)),
        stream_ref_(*stream_ptr_) {}

-  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
-    auto graph = builder_->Build(program);
-    printer_->Print(*graph, stream_ref_);
-    return graph;
+  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
+    auto new_graph = builder_->Apply(std::move(graph));
+    printer_->Print(*new_graph, stream_ref_);
+    return new_graph;
  }

  int GetVarDeviceID(const std::string& var_name) const override {

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,13 +14,14 @@

 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"

+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
 namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph)
+    const std::vector<platform::Place> &places, std::unique_ptr<Graph> &&graph)
    : graph_(std::move(graph)),
      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                       : nullptr),
@@ -43,18 +44,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  std::unordered_set<OpHandleBase *> delayed_ops;

  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->vars_) {
+  for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
    for (auto &name_pair : var_map) {
      for (auto &version_pair : name_pair.second) {
        InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
      }
    }
  }
-  for (auto &var : graph_->dep_vars_) {
+  for (auto &var : graph_->Get<details::GraphDepVars>("dep_vars")) {
    InsertPendingVar(&pending_vars, &ready_vars, var.get());
  }

-  for (auto &op : graph_->ops_) {
+  for (auto &op : graph_->Get<details::GraphOps>("ops")) {
    if (op->Inputs().empty()) {  // Special case, Op has no input.
      ready_ops.insert(op.get());
    } else {
@@ -64,11 +65,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(

  // Step 2. Insert FetchOps
  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  std::vector<std::unique_ptr<ir::Node>> tmp_nodes;
  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
  FeedFetchList fetch_data(fetch_tensors.size());

-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, &ready_vars, &fetch_data);
+  InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies,
+                 &pending_ops, &pending_vars, &ready_vars, &fetch_data);

  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
    for (auto *op : set) {
@@ -125,7 +127,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    // Find the ready_ops after the ready_var.
    for (auto ready_var : cur_ready_vars) {
      pending_vars.erase(ready_var);
-      for (auto *op : ready_var->pending_ops_) {
+      for (auto *op : ready_var->PendingOps()) {
        auto &deps = pending_ops[op];
        --deps;
        if (deps == 0) {
@@ -151,6 +153,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
 void ThreadedSSAGraphExecutor::InsertFetchOps(
    const std::vector<std::string> &fetch_tensors,
    std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+    std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
    std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
    std::unordered_set<VarHandleBase *> *pending_vars,
@@ -158,7 +161,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;

  for (auto &fetch_var_name : fetch_tensors) {
-    for (auto &var_map : graph_->vars_) {
+    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
@@ -168,8 +171,16 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(

  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
    auto &var_name = fetch_tensors[i];
-    auto &vars = fetched_vars.at(var_name);
-    auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_);
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
+
+    temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i,
+                                 &local_scopes_);
    fetch_ops->emplace_back(op);

    for (auto &p : places_) {
@@ -180,7 +191,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
      op->AddInput(var);
    }

-    auto *fetch_dummy = new DummyVarHandle();
+    // This node backs a DummyVarHandle, i.e. a variable in the dependency
+    // graph, so it is tagged kVariable (not kOperation) like the other dummy
+    // variable nodes created by the graph builder.
+    temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kVariable));
+    auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get());
    op->AddOutput(fetch_dummy);
    fetch_dependencies->emplace(fetch_dummy);
    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
@@ -198,7 +210,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
    std::unordered_set<VarHandleBase *> *pending_vars,
    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
  pending_vars->insert(var);
-  if (var->generated_op_ == nullptr) {
+  if (var->GeneratedOp() == nullptr) {
    ready_vars->Push(var);
  }
 }

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -27,6 +27,7 @@
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
 namespace framework {
@@ -39,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                           const std::vector<Scope *> &local_scopes,
                           const std::vector<platform::Place> &places,
-                           std::unique_ptr<SSAGraph> &&graph);
+                           std::unique_ptr<Graph> &&graph);

  // Run a SSAGraph by a thread pool
  // Use topological sort algorithm
@@ -52,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
             details::OpHandleBase *op);

 private:
-  std::unique_ptr<SSAGraph> graph_;
+  std::unique_ptr<Graph> graph_;
  std::unique_ptr<::ThreadPool> pool_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
@@ -71,6 +72,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  void InsertFetchOps(
      const std::vector<std::string> &fetch_tensors,
      std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+      std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
      std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
      std::unordered_set<VarHandleBase *> *pending_vars,

--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -13,11 +13,14 @@
 // limitations under the License.

 #pragma once
+
+#include <algorithm>
 #include <sstream>
 #include <string>
 #include <unordered_set>
 #include <utility>

+#include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
@@ -25,19 +28,60 @@ namespace framework {
 namespace details {
 class OpHandleBase;

+// Wraps ir::Node and provide helper utilities.
+// It's responsible for populating necessary fields of ir::Node.
+//
 // VarHandleBase is the var node in the dependency graph.
 // A variable can only be generated by a single operator. i.e.
 // This is a single assignment graph.
 struct VarHandleBase {
+  // Binds this handle to the ir::Node it wraps.
+  explicit VarHandleBase(ir::Node* node) : node_(node) {}
+
  virtual ~VarHandleBase();
+
  virtual std::string DebugString() const = 0;

+  // Makes `in` the generating op of this variable. Any previously recorded
+  // input of the wrapped node is dropped, so `node` becomes its only input.
+  void AddInput(OpHandleBase* in, ir::Node* node) {
+    generated_op_ = in;
+    node_->inputs.clear();
+    node_->inputs.push_back(node);
+  }
+
+  // Registers `out` as a consumer of this variable. `node` is appended to the
+  // wrapped node's outputs only the first time `out` is registered.
+  void AddOutput(OpHandleBase* out, ir::Node* node) {
+    const bool newly_added = pending_ops_.insert(out).second;
+    if (newly_added) {
+      node_->outputs.push_back(node);
+    }
+  }
+
+  // Unregisters `out` and removes every occurrence of `node` from the wrapped
+  // node's outputs.
+  void RemoveOutput(OpHandleBase* out, ir::Node* node) {
+    pending_ops_.erase(out);
+    auto& outs = node_->outputs;
+    auto new_end = std::remove(outs.begin(), outs.end(), node);
+    outs.erase(new_end, outs.end());
+  }
+
+  // Marks this variable as having no generating op and clears the wrapped
+  // node's inputs.
+  void ClearGeneratedOp() {
+    node_->inputs.clear();
+    generated_op_ = nullptr;
+  }
+
+  // Op that generates this variable, or nullptr if none is recorded.
+  OpHandleBase* GeneratedOp() { return generated_op_; }
+
+  // Ops registered through AddOutput as consumers of this variable.
+  const std::unordered_set<OpHandleBase*>& PendingOps() const {
+    return pending_ops_;
+  }
+
+  // The wrapped ir::Node.
+  ir::Node* Node() { return node_; }
+
+ protected:
  // The operator who generate this variable. nullptr if the variable
  // is a root node.
  OpHandleBase* generated_op_{nullptr};

  // Operators which depend on this variable ready.
  std::unordered_set<OpHandleBase*> pending_ops_;
+  ir::Node* node_;
 };

 // VarHandle is actually a single version of Runtime Variable.
@@ -46,11 +90,14 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
+  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
+
  std::string DebugString() const override;

-  VarHandle(size_t version, size_t scope_index, std::string name,
-            platform::Place place)
-      : version_(version),
+  VarHandle(ir::Node* node, size_t version, size_t scope_index,
+            std::string name, platform::Place place)
+      : VarHandleBase(node),
+        version_(version),
        scope_idx_(scope_index),
        name_(std::move(name)),
        place_(std::move(place)) {}
@@ -70,6 +117,8 @@ struct VarHandle : public VarHandleBase {

 // Dummy Variable. It is used to represent dependencies between operators
 struct DummyVarHandle : public VarHandleBase {
+  explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}
+
  std::string DebugString() const override;
 };


--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
+# Libraries are listed in dependency order: node <- graph <- pass.
+# `node` is built from node.cc and depends on proto_desc.
+cc_library(node SRCS node.cc DEPS proto_desc)
+# `graph` is built from graph.cc and depends on `node`.
+cc_library(graph SRCS graph.cc DEPS node)
+# `pass` is built from pass.cc and depends on `graph` and `node`.
+cc_library(pass SRCS pass.cc DEPS graph node)
+
+# Unit test for the graph library; also depends on proto_desc and op_registry.
+cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+
+namespace paddle {
+namespace framework {
+
+// Builds the node graph for block 0 of `program`: one op node per OpDesc and
+// one variable node per distinct variable name, with edges var -> op for op
+// inputs and op -> var for op outputs.
+//
+// NOTE(paddle-dev): This graph may contain cycles, because all uses of a
+// variable name share a single node; an op that both reads and writes the
+// same name gets an edge in each direction.
+Graph::Graph(const ProgramDesc &program) : program_(program) {
+  VLOG(3) << "block in program:" << program_.Size();
+  std::unordered_map<std::string, VarDesc *> all_vars;
+  for (auto *var : program.Block(0).AllVars()) {
+    all_vars.emplace(var->Name(), var);
+  }
+
+  std::map<std::string, ir::Node *> var_nodes;
+
+  // Returns the node already created for `var_name`, or creates one. When the
+  // block declares no VarDesc for the name, an empty variable node is created
+  // instead of failing, for inputs and outputs alike.
+  auto find_or_create_var = [&](const std::string &op_type, const char *role,
+                                const std::string &var_name) -> ir::Node * {
+    auto node_it = var_nodes.find(var_name);
+    if (node_it != var_nodes.end()) {
+      return node_it->second;
+    }
+    ir::Node *var = nullptr;
+    auto desc_it = all_vars.find(var_name);
+    if (desc_it != all_vars.end()) {
+      var = CreateVarNode(desc_it->second);
+    } else {
+      // TODO(paddle-dev): Seems some assumption doesn't hold?
+      VLOG(3) << op_type << " " << role
+              << " var not in all_var list: " << var_name;
+      var = CreateEmptyNode(var_name, ir::Node::Type::kVariable);
+    }
+    var_nodes[var_name] = var;
+    return var;
+  };
+
+  for (auto *op : program.Block(0).AllOps()) {
+    ir::Node *node = CreateOpNode(op);
+
+    for (auto &each_var_name : op->InputArgumentNames()) {
+      ir::Node *var = find_or_create_var(op->Type(), "input", each_var_name);
+      node->inputs.push_back(var);
+      var->outputs.push_back(node);
+    }
+
+    for (auto &each_var_name : op->OutputArgumentNames()) {
+      // Previously an output without a VarDesc threw std::out_of_range from
+      // all_vars.at(); it now gets an empty variable node like inputs do.
+      ir::Node *var = find_or_create_var(op->Type(), "output", each_var_name);
+      node->outputs.push_back(var);
+      var->inputs.push_back(node);
+    }
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+
+// Graph owns the ir::Node objects created through its Create*Node methods and
+// a table of named, type-erased attributes.
+class Graph {
+ public:
+  explicit Graph(const ProgramDesc& program);
+
+  // Runs every registered attribute deleter, then drops both attribute tables.
+  virtual ~Graph() {
+    for (auto& attr_del : attr_dels_) {
+      attr_del.second();
+    }
+    attrs_.clear();
+    attr_dels_.clear();
+  }
+
+  // Returns the attribute registered under `attr_name`.
+  // The attribute is stored as an AttrType* inside a boost::any, so AttrType
+  // must be the type that was used in Set(). Fails with an enforce error if no
+  // attribute is registered under `attr_name`.
+  template <typename AttrType>
+  AttrType& Get(const std::string& attr_name) const {
+    PADDLE_ENFORCE(attrs_.count(attr_name) != 0,
+                   "%s attr not registered for graph.", attr_name);
+    return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
+  }
+
+  // Registers `attr` under `attr_name` and takes ownership of it: the pointer
+  // is deleted when the Graph is destroyed. A name can be set only once.
+  template <typename AttrType>
+  void Set(const std::string& attr_name, AttrType* attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0,
+                   "%s attr already set in the graph.", attr_name);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = [attr, attr_name]() {
+      VLOG(3) << "deleting " << attr_name;
+      delete attr;
+    };
+  }
+
+  // Creates a node from `var_desc`; the Graph owns the node and the returned
+  // pointer stays valid while it remains in `nodes`.
+  ir::Node* CreateVarNode(VarDesc* var_desc) {
+    nodes.emplace_back(new ir::Node(var_desc));
+    return nodes.back().get();
+  }
+
+  // Creates a node from `op_desc`; ownership as for CreateVarNode.
+  ir::Node* CreateOpNode(OpDesc* op_desc) {
+    nodes.emplace_back(new ir::Node(op_desc));
+    return nodes.back().get();
+  }
+
+  // Creates a node that has only a name and a type (no desc); ownership as
+  // for CreateVarNode.
+  ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
+    nodes.emplace_back(new ir::Node(name, type));
+    return nodes.back().get();
+  }
+
+  // All nodes owned by this graph, in creation order.
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+
+ private:
+  // NOTE: program_ shouldn't be exposed to user.
+  // program_ is a reference member, so the ProgramDesc it is bound to must
+  // outlive this Graph.
+  const ProgramDesc& program_;
+  // Attribute name -> pointer to the attribute value (type-erased).
+  std::map<std::string, boost::any> attrs_;
+  // Attribute name -> deleter for the pointer stored in attrs_.
+  std::map<std::string, std::function<void(void)>> attr_dels_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+// Operator with an empty RunImpl; it forwards its constructor arguments to
+// OperatorBase and does nothing when run.
+class NOP : public OperatorBase {
+ public:
+  NOP(const std::string &type, const VariableNameMap &inputs,
+      const VariableNameMap &outputs, const AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  // Intentionally empty.
+  void RunImpl(const Scope &scope,
+               const platform::Place &place) const override {}
+};
+
+// Proto maker for the test ops: declares one duplicable input "X" and one
+// output "Out", all with empty descriptions.
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  // Marked `override` like the other overriding members in this file, so a
+  // signature mismatch with the base class is caught at compile time.
+  void Make() override {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+// Variable-type inference used by the test "sum" op: the first output in
+// "Out" becomes LOD_TENSOR if at least one input in "X" is a LOD_TENSOR, and
+// SELECTED_ROWS otherwise.
+class SumOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
+    auto result_type = proto::VarType::SELECTED_ROWS;
+
+    // Stop at the first LOD_TENSOR input; later inputs are not looked up.
+    for (const std::string &input_name : op_desc.Input("X")) {
+      if (block->Var(input_name)->GetType() == proto::VarType::LOD_TENSOR) {
+        result_type = proto::VarType::LOD_TENSOR;
+        break;
+      }
+    }
+
+    const std::string target_name = op_desc.Output("Out").front();
+    block->Var(target_name)->SetType(result_type);
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+// Register two test op types that share the NOP implementation and
+// SumOpMaker: "sum" additionally gets SumOpVarTypeInference, while
+// "sum_without_infer_var_type" is registered without it.
+REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
+                  paddle::framework::SumOpMaker);
+
+namespace paddle {
+namespace framework {
+
+// Exercises var-type inference on a "sum" op, then builds a Graph from the
+// same program and checks the node list and the links between the nodes.
+TEST(GraphTest, Basic) {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"test_a", "test_b", "test_c"});
+  op->SetOutput("Out", {"test_out"});
+
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_out");
+
+  // All inputs are SELECTED_ROWS, so the output must be SELECTED_ROWS too.
+  op->InferVarType(prog.MutableBlock(0));
+
+  ASSERT_EQ(proto::VarType::SELECTED_ROWS,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  // A single LOD_TENSOR input switches the output to LOD_TENSOR.
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR);
+  op->InferVarType(prog.MutableBlock(0));
+  ASSERT_EQ(proto::VarType::LOD_TENSOR,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  std::unique_ptr<Graph> g(new Graph(prog));
+  // Check the node count before indexing, so a graph with fewer nodes fails
+  // this assertion instead of reading past the end of g->nodes. The cast
+  // keeps the comparison unsigned-vs-unsigned.
+  ASSERT_EQ(g->nodes.size(), static_cast<size_t>(5));
+  ASSERT_EQ(g->nodes[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a");
+  ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b");
+  ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c");
+  ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out");
+  ASSERT_EQ(g->nodes[1]->Name(), "test_a");
+  ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[2]->Name(), "test_b");
+  ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[3]->Name(), "test_c");
+  ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum");
+  ASSERT_EQ(g->nodes[4]->Name(), "test_out");
+  ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum");
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/node.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// A vertex of the IR graph. A node is either an operation or a variable and
+// may carry a pointer to the matching OpDesc / VarDesc. The desc pointers are
+// stored as given; this class never deletes them.
+class Node {
+ public:
+  enum class Type { kOperation, kVariable };
+
+  // Builds a node that has only a name and a type; both desc pointers are
+  // left null.
+  explicit Node(const std::string& name, Type type)
+      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
+
+  // Builds a variable node named after var_desc->Name(). `var_desc` is
+  // dereferenced here, so it must not be null.
+  explicit Node(VarDesc* var_desc)
+      : name_(var_desc->Name()),
+        var_desc_(var_desc),
+        op_desc_(nullptr),
+        type_(Type::kVariable) {}
+
+  // Builds an operation node named after op_desc->Type(). `op_desc` is
+  // dereferenced here, so it must not be null.
+  explicit Node(OpDesc* op_desc)
+      : name_(op_desc->Type()),
+        var_desc_(nullptr),
+        op_desc_(op_desc),
+        type_(Type::kOperation) {}
+
+  Type NodeType() const { return type_; }
+
+  // Returns a copy of the node's name.
+  std::string Name() const { return name_; }
+
+  // Returns the wrapped VarDesc; enforces that this is a variable node. The
+  // result is null for a variable node built from a name only.
+  VarDesc* Var() {
+    PADDLE_ENFORCE(type_ == Type::kVariable);
+    return var_desc_;
+  }
+  // Returns the wrapped OpDesc; enforces that this is an operation node. The
+  // result is null for an operation node built from a name only.
+  OpDesc* Op() {
+    PADDLE_ENFORCE(type_ == Type::kOperation);
+    return op_desc_;
+  }
+
+  // Non-owning links to neighbouring nodes. Node itself never fills these in;
+  // they are public so that other code can populate them.
+  std::vector<Node*> inputs;
+  std::vector<Node*> outputs;
+
+ protected:
+  const std::string name_;
+  VarDesc* var_desc_;
+  OpDesc* op_desc_;
+  Type type_;
+
+ private:
+  // Presumably disables the copy constructor and copy assignment; the macro
+  // is defined outside this file (platform/macros.h).
+  DISABLE_COPY_AND_ASSIGN(Node);
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Abstract interface for an operation that consumes a Graph and produces a
+// Graph.
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() {}
+
+  // Takes ownership of `graph` and returns the graph the caller should use
+  // afterwards. Whether that is the same object or a new one is up to the
+  // implementation.
+  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -312,19 +312,22 @@ void WriteToRecordIO(recordio::Writer *writer,
  writer->Write(buffer.str());
 }

-std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
-  std::vector<LoDTensor> result;
-  if (scanner->HasNext()) {
-    std::istringstream sin(scanner->Next());
-    uint32_t sz;
-    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
-    result.resize(sz);
-    for (uint32_t i = 0; i < sz; ++i) {
-      DeserializeFromStream(sin, &result[i], dev_ctx);
-    }
+bool ReadFromRecordIO(recordio::Scanner *scanner,
+                      const platform::DeviceContext &dev_ctx,
+                      std::vector<LoDTensor> *result_ptr) {
+  if (!scanner->HasNext()) {
+    return false;
  }
-  return result;
+  std::istringstream sin(scanner->Next());
+  uint32_t sz;
+  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+  auto &result = *result_ptr;
+  result.resize(sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+    DeserializeFromStream(sin, &result[i], dev_ctx);
+  }
+
+  return true;
 }

 std::vector<LoDTensor> LoDTensor::SplitLoDTensor(

--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer,
                            const std::vector<LoDTensor>& tensor,
                            const platform::DeviceContext& dev_ctx);

-extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
+extern bool ReadFromRecordIO(recordio::Scanner* scanner,
+                             const platform::DeviceContext& dev_ctx,
+                             std::vector<LoDTensor>* result_ptr);

 /*
 * Convert between length-based LoD and offset-based LoD.

--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -301,11 +301,12 @@ static void TestRecordIO() {
  {
    std::unique_ptr<std::istream> stream_ptr(stream);
    recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(&scanner, ctx);
+    std::vector<framework::LoDTensor> tensors;
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
    assert_tensor_ok(tensors[0]);
    assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(&scanner, ctx);
+    ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
    assert_tensor_ok(tensors[0]);
    assert_tensor_ok(tensors[1]);

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -26,6 +26,7 @@
 namespace paddle {
 namespace framework {

+#if defined(PADDLE_WITH_CUDA)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -37,11 +38,11 @@ class Vector {
  Vector() { InitEmpty(); }

  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T& value = T()) {
+  explicit Vector(size_t count, const T &value = T()) {
    InitEmpty();
    if (count != 0) {
      resize(count);
-      T* ptr = begin();
+      T *ptr = begin();
      for (size_t i = 0; i < count; ++i) {
        ptr[i] = value;
      }
@@ -59,7 +60,7 @@ class Vector {

  // implicit cast from std::vector.
  template <typename U>
-  Vector(const std::vector<U>& dat) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
    if (dat.size() == 0) {
      InitEmpty();
    } else {
@@ -68,10 +69,10 @@ class Vector {
  }

  // Copy ctor
-  Vector(const Vector<T>& other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { this->operator=(other); }

  // Copy operator
-  Vector<T>& operator=(const Vector<T>& other) {
+  Vector<T> &operator=(const Vector<T> &other) {
    if (other.size() != 0) {
      this->InitByIter(other.size(), other.begin(), other.end());
    } else {
@@ -81,7 +82,7 @@ class Vector {
  }

  // Move ctor
-  Vector(Vector<T>&& other) {
+  Vector(Vector<T> &&other) {
    this->size_ = other.size_;
    this->flag_ = other.flag_;
    if (other.cuda_vec_.memory_size()) {
@@ -93,13 +94,13 @@ class Vector {
  }

  // CPU data access method. Mutable.
-  T& operator[](size_t i) {
+  T &operator[](size_t i) {
    MutableCPU();
-    return const_cast<T*>(cpu_vec_.data<T>())[i];
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
  }

  // CPU data access method. Immutable.
-  const T& operator[](size_t i) const {
+  const T &operator[](size_t i) const {
    ImmutableCPU();
    return cpu_vec_.data<T>()[i];
  }
@@ -107,43 +108,43 @@ class Vector {
  // std::vector iterator methods. Based on CPU data access method
  size_t size() const { return size_; }

-  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }

-  T* end() {
+  T *end() {
    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
  }

-  T& front() { return *begin(); }
+  T &front() { return *begin(); }

-  T& back() {
+  T &back() {
    auto it = end();
    --it;
    return *it;
  }

-  const T* begin() const {
+  const T *begin() const {
    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
  }

-  const T* end() const {
+  const T *end() const {
    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
  }

-  const T* cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }

-  const T* cend() const { return end(); }
+  const T *cend() const { return end(); }

-  const T& back() const {
+  const T &back() const {
    auto it = end();
    --it;
    return *it;
  }

-  T* data() { return begin(); }
+  T *data() { return begin(); }

-  const T* data() const { return begin(); }
+  const T *data() const { return begin(); }

-  const T& front() const { return *begin(); }
+  const T &front() const { return *begin(); }
  // end of std::vector iterator methods

  // assign this from iterator.
@@ -169,7 +170,7 @@ class Vector {
  void Extend(It begin, It end) {
    size_t pre_size = size_;
    resize(pre_size + (end - begin));
-    T* ptr = this->begin() + pre_size;
+    T *ptr = this->begin() + pre_size;
    for (; begin < end; ++begin, ++ptr) {
      *ptr = *begin;
    }
@@ -183,9 +184,9 @@ class Vector {
      MutableCPU();
      Tensor cpu_tensor;
      platform::Place cpu = platform::CPUPlace();
-      T* ptr = cpu_tensor.mutable_data<T>(
+      T *ptr = cpu_tensor.mutable_data<T>(
          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T* old_ptr =
+      const T *old_ptr =
          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
      if (old_ptr != nullptr) {
        std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,7 +197,7 @@ class Vector {
  }

  // get cuda ptr. immutable
-  const T* CUDAData(platform::Place place) const {
+  const T *CUDAData(platform::Place place) const {
    PADDLE_ENFORCE(platform::is_gpu_place(place),
                   "CUDA Data must on CUDA place");
    ImmutableCUDA(place);
@@ -204,10 +205,10 @@ class Vector {
  }

  // get cuda ptr. mutable
-  T* CUDAMutableData(platform::Place place) {
-    const T* ptr = CUDAData(place);
+  T *CUDAMutableData(platform::Place place) {
+    const T *ptr = CUDAData(place);
    flag_ = kDirty | kDataInCUDA;
-    return const_cast<T*>(ptr);
+    return const_cast<T *>(ptr);
  }

  // clear
@@ -228,7 +229,7 @@ class Vector {
  }

  // the unify method to access CPU or CUDA data. immutable.
-  const T* Data(platform::Place place) const {
+  const T *Data(platform::Place place) const {
    if (platform::is_gpu_place(place)) {
      return CUDAData(place);
    } else {
@@ -237,7 +238,7 @@ class Vector {
  }

  // the unify method to access CPU or CUDA data. mutable.
-  T* MutableData(platform::Place place) {
+  T *MutableData(platform::Place place) {
    if (platform::is_gpu_place(place)) {
      return CUDAMutableData(place);
    } else {
@@ -253,7 +254,7 @@ class Vector {
    return result;
  }

-  bool operator==(const Vector<T>& other) const {
+  bool operator==(const Vector<T> &other) const {
    if (size() != other.size()) return false;
    auto it1 = cbegin();
    auto it2 = other.cbegin();
@@ -274,7 +275,7 @@ class Vector {
  template <typename Iter>
  void InitByIter(size_t size, Iter begin, Iter end) {
    platform::Place cpu = platform::CPUPlace();
-    T* ptr = this->cpu_vec_.template mutable_data<T>(
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
    for (size_t i = 0; i < size; ++i) {
      *ptr++ = *begin++;
@@ -368,7 +369,7 @@ class Vector {
    }
  }

-  static T& EmptyDummy() {
+  static T &EmptyDummy() {
    static T dummy = T();
    return dummy;
  }
@@ -379,5 +380,53 @@ class Vector {
  size_t size_;
 };

-}  // namespace framework
+#else  // PADDLE_WITH_CUDA
+
+// CPU-only Vector<T>: a std::vector<T> with checked operator[] and Extend().
+template <typename T>
+class CPUVector : public std::vector<T, std::allocator<T>> {
+ public:
+  CPUVector() : std::vector<T>() {}
+  CPUVector(size_t count, const T &value = T())
+      : std::vector<T>(count, value) {}
+  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
+  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
+  // Self-assignment is skipped: assign() must not be fed its own iterators.
+  CPUVector &operator=(const CPUVector &other) {
+    if (this != &other) this->assign(other.begin(), other.end());
+    return *this;
+  }
+  CPUVector &operator=(const std::vector<T> &other) {
+    if (this != &other) this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
+    for (auto v : other) {
+      os << v << " ";
+    }
+    return os;
+  }
+
+  // Call the base class explicitly; this->resize() would recurse forever.
+  void resize(size_t size) { std::vector<T>::resize(size); }
+  T &operator[](size_t id) { return this->at(id); }
+  const T &operator[](size_t id) const { return this->at(id); }
+  // Appends the range [begin, end); D must support `end - begin`.
+  template <typename D>
+  void Extend(const D &begin, const D &end) {
+    this->reserve(this->size() + size_t(end - begin));
+    this->insert(this->end(), begin, end);
+  }
+};
+
+template <typename T>
+using Vector = CPUVector<T>;
+
+#endif  // PADDLE_WITH_CUDA
+
+}  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <tuple>
 #include <vector>

+#include "paddle/fluid/framework/ir/graph.h"
+
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -129,12 +131,11 @@ ParallelExecutor::ParallelExecutor(
    PADDLE_THROW("Not compiled with CUDA.");
 #endif
  }
-
  builder_ = builder_factory.Create();
+  std::unique_ptr<Graph> graph(new Graph(main_program));
+  graph = builder_->Apply(std::move(graph));
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places,
-      builder_->Build(main_program)));
-
+      exec_strategy, member_->local_scopes_, places, std::move(graph)));
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
      member_->places_, std::move(member_->executor_)));

--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -67,7 +67,8 @@ void ReaderBase::Start() {
  }
 }

-ReaderBase::~ReaderBase() { Shutdown(); }
+ReaderBase::~ReaderBase() {}

+DecoratedReader::~DecoratedReader() { reader_->Shutdown(); }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -25,8 +25,6 @@
 namespace paddle {
 namespace framework {

-enum ReaderStatus { kRunning, kStopped };
-
 class ReaderBase {
 public:
  virtual void ReadNext(std::vector<LoDTensor>* out);
@@ -48,6 +46,8 @@ class ReaderBase {

  virtual void StartImpl() {}

+  enum ReaderStatus { kRunning, kStopped };
+
  ReaderStatus status_{kRunning};

  mutable std::mutex mu_;
@@ -74,6 +74,8 @@ class DecoratedReader : public ReaderBase,
    reader_->InsertDecoratedReader(shared_from_this());
  }

+  ~DecoratedReader();
+
 protected:
  void ShutdownImpl() override { reader_->Shutdown(); }


--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -15,6 +15,7 @@
 #include <algorithm>
 #include <limits>
 #include <vector>
+#include "paddle/fluid/framework/data_type.h"

 namespace paddle {
 namespace framework {
@@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
    os.write(out.data(), size);
  }
  {  // the 3rd field, tensor data
-    uint64_t size = tensor.memory_size();
+    uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
+
    auto* data_ptr = tensor.data<void>();
    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
                   "Index overflow when writing tensor");
@@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
    tensor->Resize(framework::make_ddim(dims));
    void* buf;
    auto ctx = platform::CPUDeviceContext();
+    size_t size =
+        tensor->numel() *
+        framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
      Tensor cpu_tensor;
@@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      is.read(static_cast<char*>(buf), size);
      auto dst_place = dev_ctx.GetPlace();
      framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
 #else
@@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
      framework::VisitDataType(
          desc.data_type(),
          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
-      is.read(static_cast<char*>(buf), tensor->memory_size());
+      is.read(static_cast<char*>(buf), size);
    }
  }
 }

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -38,4 +38,6 @@ if(WITH_TESTING)
  # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
  add_subdirectory(tests/book)
 endif()
-add_subdirectory(api)
+if(NOT APPLE)
+  add_subdirectory(api)
+endif()
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -19,10 +19,14 @@ function (inference_analysis_test TARGET)
        set(multiValueArgs SRCS)
        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

+        set(mem_opt "")
+        if(WITH_GPU)
+            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
+        endif()
        cc_test(${TARGET}
                SRCS "${analysis_test_SRCS}"
                DEPS analysis
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
    endif(WITH_TESTING)
 endfunction(inference_analysis_test)

--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -22,8 +22,6 @@
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"

 namespace paddle {
-namespace inference {
-namespace analysis {

 DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
            "Enable subgraph to TensorRT engine for acceleration");
@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
              "Graphviz debuger for data flow graphs.");

+namespace inference {
+namespace analysis {
+
 class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -45,14 +45,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass_manager.h"

 namespace paddle {
-namespace inference {
-namespace analysis {

 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);

+namespace inference {
+namespace analysis {
+
 class Analyzer : public OrderedRegistry<PassManager> {
 public:
  // Register all the pass-managers.

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -13,13 +13,21 @@
 // limitations under the License.

 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"

 namespace paddle {
 namespace inference {
 namespace analysis {

-TEST_F(DFG_Tester, main) {
+TEST_F(DFG_Tester, analysis_without_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+TEST_F(DFG_Tester, analysis_with_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
  Analyzer analyser;
  analyser.Run(&argument);
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
  return stack_.top();
 }

+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inlinks.size() == n;
+}
+
 GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
    const std::vector<Node *> &source) {
  PADDLE_ENFORCE(!source.empty(),
                 "Start points of topological sorting should not be empty!");
+  // CHECK all the inputs' in-degree is 0
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
  std::unordered_set<Node *> visited;
  std::unordered_set<Node *> to_visit{source.begin(), source.end()};

@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
  while (!to_visit.empty()) {
    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
    for (auto *p : queue) {
+      if (p->deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+        continue;
+      }
      inlink_visited.clear();

      std::copy_if(p->inlinks.begin(), p->inlinks.end(),
@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
  return sorted_[cursor_];
 }

+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {

   private:
    std::vector<Node *> sorted_;
-    int cursor_{0};
+    size_t cursor_{0};
  };

  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);

 }  // namespace analysis
 }  // namespace inference

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -22,14 +22,18 @@

 namespace paddle {
 namespace inference {
+
+DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
+
 namespace analysis {

 using framework::proto::ProgramDesc;

 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes);
+    const std::vector<std::unique_ptr<Node>> &nodes);

-bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
  PADDLE_ENFORCE(!argument->transformed_program_desc);
@@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {

 bool DataFlowGraphToFluidPass::Finalize() { return true; }

-void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
-  auto traits = GraphTraits<DataFlowGraph>(graph);
-  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
-    if (it->deleted()) continue;
+void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
+  LOG(INFO) << "graph.inputs " << graph->inputs.size();
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.deleted()) continue;

-    switch (it->type()) {
+    switch (node.type()) {
      case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << it->repr();
-        AddFluidOp(&(*it));
+        LOG(INFO) << "add function " << node.repr();
+        AddFluidOp(&node);
      } break;
      case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << it->repr() << " , "
-                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
-        AddEngineOp(&(*it));
+        LOG(INFO) << "add engine op " << node.repr() << " , "
+                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
+        AddEngineOp(&node);
      } break;
      default:
        continue;
    }
  }
+
+  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
 }

-void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
-  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
+  auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
  // currently only the main block is analyzed.
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
  *op = *ori_op;  // copy the attributes, by default, these will not be changed
-                  // by analysis phrase.
+  // by analysis phase.
  // The inputs and outputs of the existing ops are not changed by tensorrt
  // subgraph pass.
  // NOTE It might be changed by other passes in the long run.
 }

-void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
-                       const framework::proto::BlockDesc& block) {
+void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+                       const framework::proto::BlockDesc &block) {
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
-  auto* func = static_cast<FunctionBlock*>(node);
+  auto *func = static_cast<FunctionBlock *>(node);

  // collect inputs
  std::vector<std::string> io;
-  for (auto* x : func->inlinks) {
+  for (auto *x : func->inlinks) {
    io.push_back(x->name());
  }
  desc.SetInput("Xs", io);

  // collect outputs
  io.clear();
-  for (auto* x : func->outlinks) {
+  for (auto *x : func->outlinks) {
    io.push_back(x->name());
  }
  desc.SetOutput("Ys", io);
-
  desc.SetType("tensorrt_engine");
+
+  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
  // Set attrs
  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
-  SetAttr(desc.Proto(), "engine_unique_key",
-          "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
-  SetAttr(desc.Proto(), "max_workspace",
-          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
+  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  node->SetPbMsg(desc.Proto()->SerializeAsString());
 }

 std::vector<std::string> ExtractParameters(
-    const std::vector<std::unique_ptr<Node>>& nodes) {
+    const std::vector<std::unique_ptr<Node>> &nodes) {
  std::vector<std::string> parameters;
-  for (const auto& node : nodes) {
+  for (const auto &node : nodes) {
    if (!node->IsValue()) continue;
    PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
    framework::proto::VarDesc var;
@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
  return parameters;
 }

-void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  // TODO(Superjomn) Here need to expose some arguments for default setting.
  PADDLE_ENFORCE(node->IsFunctionBlock());
-  auto* block_node = static_cast<FunctionBlock*>(node);
+  auto *block_node = static_cast<FunctionBlock *>(node);
  framework::proto::BlockDesc proto;
  framework::BlockDesc block_desc(nullptr, &proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  LOG(INFO) << "origin variable size: "
+            << argument_->origin_program_desc->blocks(0).vars().size();
+  LOG(INFO) << "transformed variable size: "
+            << block_desc.Proto()->vars().size();
  // copy ops.
-  for (auto* node : block_node->subgraph) {
-    auto* op = block_desc.AppendOp();
+  for (auto *node : block_node->subgraph) {
+    auto *op = block_desc.AppendOp();
    PADDLE_ENFORCE(!node->pb_msg().empty());
    op->Proto()->ParseFromString(node->pb_msg());
  }
+  *block_desc.Proto()->mutable_vars() =
+      argument_->origin_program_desc->blocks(0).vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
-  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
-  auto* op = main_block->add_ops();
+  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
  op->ParseFromString(node->pb_msg());
 }
@@ -151,7 +165,7 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 public:
  using Config = DFG_GraphvizDrawPass::Config;
-  explicit DFG_DebuggerPass(const Config& config)
+  explicit DFG_DebuggerPass(const Config &config)
      : DFG_GraphvizDrawPass(config) {}

  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace

-Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
      FLAGS_inference_analysis_graphviz_log_root,
      "data_flow_graph_to_fluid_graphviz_debugger"));

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -26,6 +26,10 @@

 namespace paddle {
 namespace inference {
+
+DECLARE_int32(tensorrt_max_batchsize);
+DECLARE_int32(tensorrt_workspace_size);
+
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
 public:

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
    no++;
  }
  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 82);
+  ASSERT_EQ(no, 83);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
  PADDLE_ENFORCE(argument);
  if (!argument->main_dfg) {
-    LOG(INFO) << "Init DFG";
    argument->main_dfg.reset(new DataFlowGraph);
  }
  desc_ = argument->origin_program_desc.get();
@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    v->SetPbMsg(var.SerializeAsString());
    var2id[var.name()] = v->id();
  }
+
  for (int i = 0; i < main_block.ops_size(); i++) {
    const auto &op = main_block.ops(i);
    auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
    o->SetPbMsg(op.SerializeAsString());

    // set inputs and outputs
-    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    std::unordered_set<Node *> inlinks;
    for (int j = 0; j < op.inputs_size(); j++) {
      auto &in_var = op.inputs(j);
      for (int k = 0; k < in_var.arguments_size(); k++) {
        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
        in->outlinks.push_back(o);
        o->inlinks.push_back(in);
+        inlinks.insert(in);
      }
    }
    for (int j = 0; j < op.outputs_size(); j++) {
      auto &out_var = op.outputs(j);
      for (int k = 0; k < out_var.arguments_size(); k++) {
        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (inlinks.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] = out_alias->id();  // update a -> a1
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
        out->inlinks.push_back(o);
        o->outlinks.push_back(out);
      }

--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -24,12 +24,12 @@ namespace analysis {
 TEST_F(DFG_Tester, Init) {
  FluidToDataFlowGraphPass pass;
  pass.Initialize(&argument);
-  DataFlowGraph graph;
-  pass.Run(&graph);
+  pass.Run(argument.main_dfg.get());
  // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(graph.nodes.size(), 37UL);
+  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
  pass.Finalize();
-  LOG(INFO) << '\n' << graph.DotString();
+  ASSERT_FALSE(argument.main_dfg->DotString().empty());
+  EXPECT_FALSE(argument.main_dfg->inputs.empty());
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(

 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  VLOG(4) << "debug info "
+          << graph->HumanReadableInfo(false /*show_values*/,
+                                      true /*show_functions*/);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -26,13 +26,13 @@ endif()
 function(inference_api_test TARGET_NAME)
    if (WITH_TESTING)
        set(options "")
-        set(oneValueArgs "")
+        set(oneValueArgs SRC)
        set(multiValueArgs ARGS)
        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

        set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
        cc_test(${TARGET_NAME}
-                SRCS ${TARGET_NAME}.cc
+                SRCS ${inference_test_SRC}
                DEPS "${inference_deps}"
                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
        if(inference_test_ARGS)
@@ -73,24 +73,24 @@ if(NOT APPLE)
 endif()

 cc_test(test_paddle_inference_api
-        SRCS test_api.cc
+        SRCS api_tester.cc
        DEPS paddle_inference_api)

-inference_api_test(test_api_impl
+inference_api_test(test_api_impl SRC api_impl_tester.cc
                    ARGS test_word2vec test_image_classification)

 if(WITH_GPU AND TENSORRT_FOUND)
 cc_library(paddle_inference_tensorrt_subgraph_engine
        SRCS api_tensorrt_subgraph_engine.cc
-        DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)

-inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
+inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()

 if (WITH_ANAKIN) # only needed in CI
    # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
    # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
-    # compile the libinference_anakin_api.a and compile with anakin.so.
+    # compile the libinference_anakin_api.a and anakin.so.
    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
    nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})

--- a/paddle/fluid/inference/api/api_anakin_engine.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine.cc
@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {

 bool PaddleInferenceAnakinPredictor::Run(
    const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data) {
+    std::vector<PaddleTensor> *output_data, int batch_size) {
  for (const auto &input : inputs) {
    if (input.dtype != PaddleDType::FLOAT32) {
      LOG(ERROR) << "Only support float type inputs. " << input.name

--- a/paddle/fluid/inference/api/api_anakin_engine.h
+++ b/paddle/fluid/inference/api/api_anakin_engine.h
@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
  // should be allocated first.
  bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data) override;
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override;

  std::unique_ptr<PaddlePredictor> Clone() override;


--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -66,6 +66,7 @@ bool NativePaddlePredictor::Init(
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
+    PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
  } else {
    paddle::framework::InitDevices(false);
    scope_.reset(new paddle::framework::Scope());
@@ -102,13 +103,13 @@ bool NativePaddlePredictor::Init(

 NativePaddlePredictor::~NativePaddlePredictor() {
  if (sub_scope_) {
-    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
    scope_->DeleteScope(sub_scope_);
  }
 }

 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data) {
+                                std::vector<PaddleTensor> *output_data,
+                                int batch_size) {
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();

--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
  bool Init(std::shared_ptr<framework::Scope> parent_scope);

  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override;
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;

  std::unique_ptr<PaddlePredictor> Clone() override;


--- a/paddle/fluid/inference/api/test_api_impl.cc
+++ b/paddle/fluid/inference/api/test_api_impl.cc
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"

 namespace paddle {

@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
      return false;
    }

-    // Analyze inference_program
-    Argument argument;
-    argument.origin_program_desc.reset(
-        new ProgramDesc(*inference_program_->Proto()));
-    Singleton<Analyzer>::Global().Run(&argument);
-    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
-    *inference_program_->Proto() = *argument.transformed_program_desc;
+    OptimizeInferenceProgram();
    ctx_ = executor_->Prepare(*inference_program_, 0);

    VLOG(5) << "to create variables";
@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
    return true;
  }

+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override {
+    PADDLE_ENFORCE_GT(batch_size, 0,
+                      "TensorRT engine needs the argument batch_size set");
+    FLAGS_tensorrt_engine_batch_size = batch_size;
+    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
+  }
+
+  void OptimizeInferenceProgram() {
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    inference_program_.reset(
+        new framework::ProgramDesc(*argument.transformed_program_desc));
+  }
+
 private:
  TensorRTConfig config_;
 };

--- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc
@@ -15,50 +15,79 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

 namespace paddle {

 DEFINE_string(dirname, "", "Directory of the inference model.");

-void Main(bool use_gpu) {
+void CompareTensorRTWithFluid(bool enable_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
+
  //# 1. Create PaddlePredictor with a config.
-  TensorRTConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto predictor =
+  NativeConfig config0;
+  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.use_gpu = true;
+  config0.fraction_of_gpu_memory = 0.3;
+  config0.device = 0;
+
+  TensorRTConfig config1;
+  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.use_gpu = true;
+  config1.fraction_of_gpu_memory = 0.3;
+  config1.device = 0;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
      CreatePaddlePredictor<TensorRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);

-  for (int batch_id = 0; batch_id < 3; batch_id++) {
+  for (int batch_id = 0; batch_id < 1; batch_id++) {
    //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
+    std::vector<int64_t> data(20);
+    for (int i = 0; i < 20; i++) data[i] = i;

-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    PaddleTensor tensor{
+        .name = "",
+        .shape = std::vector<int>({10, 1}),
+        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
+        .dtype = PaddleDType::INT64};

    // For simplicity, we set all the slots with the same data.
    std::vector<PaddleTensor> slots(4, tensor);

    //# 3. Run
-    std::vector<PaddleTensor> outputs;
-    CHECK(predictor->Run(slots, &outputs));
+    std::vector<PaddleTensor> outputs0;
+    std::vector<PaddleTensor> outputs1;
+    CHECK(predictor0->Run(slots, &outputs0));
+    CHECK(predictor1->Run(slots, &outputs1, 10));

    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
-    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    ASSERT_EQ(outputs0.size(), 1UL);
+    ASSERT_EQ(outputs1.size(), 1UL);
+
+    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+    EXPECT_EQ(num_elements, num_elements1);
+
+    auto *data0 = static_cast<float *>(outputs0.front().data.data());
+    auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+    ASSERT_GT(num_elements, 0UL);
+    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+      EXPECT_NEAR(data0[i], data1[i], 1e-3);
    }
  }
 }

-TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
+  CompareTensorRTWithFluid(false);
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
+  CompareTensorRTWithFluid(true);
+}

 }  // namespace paddle
--- a/paddle/fluid/inference/api/test_api.cc
+++ b/paddle/fluid/inference/api/test_api.cc
@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
    LOG(INFO) << "I get other_config " << config.other_config;
  }
  bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override {
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = 0) override {
    LOG(INFO) << "Run";
    return false;
  }

--- a/paddle/fluid/inference/api/high_level_api.md
+++ b/paddle/fluid/inference/api/high_level_api.md
@@ -57,4 +57,4 @@ By specifying the engine kind and config, one can get a specific implementation.
 ## Reference

 - [paddle_inference_api.h](./paddle_inference_api.h)
- [some demos](./demo)
+- [some demos](./demo_ci)
--- a/paddle/fluid/inference/api/high_level_api_cn.md
+++ b/paddle/fluid/inference/api/high_level_api_cn.md
@@ -83,5 +83,5 @@ CHECK(predictor->Run(slots, &outputs));

 ## 详细代码参考

- [inference demos](./demo)
- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
+- [inference demos](./demo_ci)
+- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -98,7 +98,8 @@ class PaddlePredictor {
  // responsible for the output tensor's buffer, either allocated or passed from
  // outside.
  virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data) = 0;
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;

  // Clone a predictor that share the model weights, the Cloned predictor should
  // be thread-safe.

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -93,6 +93,10 @@ class OpConverter {
  framework::Scope* scope_{nullptr};
 };

+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
    trt_##op_type__##_converter() {                                            \
@@ -111,7 +115,3 @@ class OpConverter {
  extern int TouchConverterRegister_##op_type__();                      \
  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
      TouchConverterRegister_##op_type__();
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -28,18 +28,20 @@ namespace tensorrt {

 int TensorRTEngine::runtime_batch_ = 1;

-void TensorRTEngine::Build(const DescType& paddle_model) {
+void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
 }

 void TensorRTEngine::Execute(int batch_size) {
-  std::vector<void*> buffers;
-  for (auto& buf : buffers_) {
+  batch_size_ = batch_size;
+  std::vector<void *> buffers;
+  for (auto &buf : buffers_) {
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
    PADDLE_ENFORCE_GT(buf.max_size, 0);
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
+  PADDLE_ENFORCE_NOT_NULL(stream_);
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
  SetRuntimeBatch(batch_size);
@@ -48,7 +50,7 @@ void TensorRTEngine::Execute(int batch_size) {
 TensorRTEngine::~TensorRTEngine() {
  cudaStreamSynchronize(*stream_);
  // clean buffer
-  for (auto& buf : buffers_) {
+  for (auto &buf : buffers_) {
    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
      buf.buffer = nullptr;
@@ -73,33 +75,37 @@ void TensorRTEngine::FreezeNetwork() {

  // allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size());
-  for (auto& item : buffer_sizes_) {
+  for (auto &item : buffer_sizes_) {
+    // The output buffers are not set in the network building phase, need to
+    // infer from the TensorRT network.
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
+      PADDLE_ENFORCE_GT(item.second, 0);
    }
-    auto& buf = buffer(item.first);
+
+    auto &buf = buffer(item.first);
+    buf.max_size = item.second * max_batch_;
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
-            << buf.buffer;
-    buf.size = item.second;
-    buf.max_size = item.second * max_batch_;
+    buf.size = 0;
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1 GiB
    buf.device = DeviceType::GPU;
  }
 }

-nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dims) {
+                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);

  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
+  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
@@ -108,12 +114,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
  return input;
 }

-void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
-                                   const std::string& name) {
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
+                                   const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

-  auto* output = layer->getOutput(offset);
+  auto *output = layer->getOutput(offset);
  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
@@ -125,11 +131,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
  buffer_sizes_[name] = 0;
 }

-void TensorRTEngine::DeclareOutput(const std::string& name) {
+void TensorRTEngine::DeclareOutput(const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
@@ -139,13 +145,13 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
  buffer_sizes_[name] = 0;
 }

-void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
  return buffer(name).buffer;
 }

-void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
+void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst) {
  // determine data size
-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
@@ -155,17 +161,17 @@ void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                    cudaMemcpyDeviceToDevice, *stream_),
                    0);
 }

-void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
+void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst) {
  // determine data size

-  auto* output = TensorRTEngine::GetITensor(name);
+  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
@@ -174,13 +180,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                       cudaMemcpyDeviceToHost, *stream_));
 }

-Buffer& TensorRTEngine::buffer(const std::string& name) {
+Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -188,19 +194,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
  return buffers_[slot_offset];
 }

-void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                     size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_NOT_NULL(data);
+  PADDLE_ENFORCE_NOT_NULL(stream_);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  buf.size = size;
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
 }

-void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                     size_t size) {
-  auto& buf = buffer(name);
+  auto &buf = buffer(name);
+  buf.size = size;
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
@@ -208,15 +218,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                       cudaMemcpyDeviceToDevice, *stream_));
 }

-void TensorRTEngine::SetITensor(const std::string& name,
-                                nvinfer1::ITensor* tensor) {
+void TensorRTEngine::SetITensor(const std::string &name,
+                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                    name);
  itensor_map_[name] = tensor;
 }

-nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
  return itensor_map_[name];
 }

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
        stream_(stream ? stream : &default_stream_),
-        logger_(logger) {}
+        logger_(logger) {
+    cudaStreamCreate(&default_stream_);
+  }

  virtual ~TensorRTEngine();

@@ -125,6 +127,9 @@ class TensorRTEngine : public EngineBase {
  static int runtime_batch_;
  // the max memory size the engine uses
  int max_workspace_;
+
+  // batch size of the current data, will be updated on each execution.
+  int batch_size_{-1};
  cudaStream_t* stream_;
  // If stream_ is not set from outside, hold its own stream.
  cudaStream_t default_stream_;

--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -103,6 +103,11 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {

  LOG(INFO) << "to get output";
  float y_cpu[2] = {-1., -1.};
+
+  auto dims = engine_->GetITensor("y")->getDimensions();
+  ASSERT_EQ(dims.nbDims, 3);
+  ASSERT_EQ(dims.d[0], 2);
+  ASSERT_EQ(dims.d[1], 1);
  engine_->GetOutputInCPU("y", &y_cpu[0]);
  ASSERT_EQ(y_cpu[0], 4.5);
  ASSERT_EQ(y_cpu[1], 14.5);

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -168,6 +168,8 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(relu);\n")
      elseif(${TARGET} STREQUAL "fake_dequantize")
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
      else()
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
      endif()
@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
-    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+      DEPS tensorrt_engine_op
      analysis)
 else()
    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)

--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
      auto& grad_tensor = grad_merge.value();
      const T* grad_data = grad_tensor.template data<T>();
      int64_t* rows = nullptr;
+// When compiled without CUDA, the CUDAMutableData() interface should not be
+// provided.
+#if defined(PADDLE_WITH_CUDA)
      if (platform::is_gpu_place(ctx.GetPlace())) {
        rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
      } else {
+#endif
        rows = grad_merge.mutable_rows()->data();
+
+#if defined(PADDLE_WITH_CUDA)
      }
+#endif
      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();

      SparseAdamFunctor<T> functor(

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -24,15 +24,16 @@ class AucOp : public framework::OperatorWithKernel {

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Indices"),
-                   "Input of Indices should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Predict"),
+                   "Input of Predict should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"),
                   "Input of Label should not be null.");
-    auto inference_height = ctx->GetInputDim("Out")[0];
+    auto predict_width = ctx->GetInputDim("Predict")[1];
+    PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification");
+    auto predict_height = ctx->GetInputDim("Predict")[0];
    auto label_height = ctx->GetInputDim("Label")[0];

-    PADDLE_ENFORCE_EQ(inference_height, label_height,
+    PADDLE_ENFORCE_EQ(predict_height, label_height,
                      "Out and Label should have same height.");

    int num_thres = ctx->Attrs().Get<int>("num_thresholds");
@@ -43,14 +44,14 @@ class AucOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("FPOut", {num_thres});
    ctx->SetOutputDim("FNOut", {num_thres});

-    ctx->ShareLoD("Out", /*->*/ "AUC");
+    ctx->ShareLoD("Predict", /*->*/ "AUC");
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("Predict")->type()),
        ctx.device_context());
  }
 };
@@ -58,18 +59,13 @@ class AucOp : public framework::OperatorWithKernel {
 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("Out",
-             "A floating point 2D tensor, values are in the range [0, 1]."
-             "Each row is sorted in descending order. This input should be the"
-             "output of topk."
+    AddInput("Predict",
+             "A floating point 2D tensor with shape [batch_size, 2], values "
+             "are in the range [0, 1]."
             "Typically, this tensor indicates the probability of each label");
-    AddInput("Indices",
-             "An int 2D tensor, indicating the indices of original"
-             "tensor before sorting. Typically, this tensor indicates which "
-             "label the probability stands for.");
    AddInput("Label",
-             "A 2D int tensor indicating the label of the training data."
-             "The height is batch size and width is always 1.");
+             "A 2D int tensor indicating the label of the training data. "
+             "shape: [batch_size, 1]");
    AddInput("TP", "True-Positive value.");
    AddInput("FP", "False-Positive value.");
    AddInput("TN", "True-Negative value.");

--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -31,7 +31,7 @@ template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* inference = ctx.Input<Tensor>("Out");
+    auto* predict = ctx.Input<Tensor>("Predict");
    auto* label = ctx.Input<Tensor>("Label");
    auto* auc = ctx.Output<Tensor>("AUC");
    // Only use output var for now, make sure it's persistable and
@@ -41,24 +41,24 @@ class AucKernel : public framework::OpKernel<T> {
    auto* true_negative = ctx.Output<Tensor>("TNOut");
    auto* false_negative = ctx.Output<Tensor>("FNOut");

-    float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
+    auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());

    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    std::vector<float> thresholds_list;
+    std::vector<double> thresholds_list;
    thresholds_list.reserve(num_thresholds);
    for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
    }
-    const float kEpsilon = 1e-7;
+    const double kEpsilon = 1e-7;
    thresholds_list[0] = 0.0f - kEpsilon;
    thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;

-    size_t batch_size = inference->dims()[0];
-    size_t inference_width = inference->dims()[1];
+    size_t batch_size = predict->dims()[0];
+    size_t inference_width = predict->dims()[1];

-    const T* inference_data = inference->data<T>();
-    const int64_t* label_data = label->data<int64_t>();
+    const T* inference_data = predict->data<T>();
+    const auto* label_data = label->data<int64_t>();

    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
@@ -66,20 +66,19 @@ class AucKernel : public framework::OpKernel<T> {
    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());

    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
-      // caculate TP, FN, TN, FP for current thresh
+      // calculate TP, FN, TN, FP for current thresh
      int64_t tp = 0, fn = 0, tn = 0, fp = 0;
      for (size_t i = 0; i < batch_size; i++) {
-        // NOTE: label_data used as bool, labels >0 will be treated as true.
+        // NOTE: label_data used as bool, labels > 0 will be treated as true.
        if (label_data[i]) {
-          // use first(max) data in each row
-          if (inference_data[i * inference_width] >=
+          if (inference_data[i * inference_width + 1] >=
              (thresholds_list[idx_thresh])) {
            tp++;
          } else {
            fn++;
          }
        } else {
-          if (inference_data[i * inference_width] >=
+          if (inference_data[i * inference_width + 1] >=
              (thresholds_list[idx_thresh])) {
            fp++;
          } else {
@@ -94,21 +93,21 @@ class AucKernel : public framework::OpKernel<T> {
      fp_data[idx_thresh] += fp;
    }
    // epsilon to avoid divide by zero.
-    float epsilon = 1e-6;
+    double epsilon = 1e-6;
    // Riemann sum to caculate auc.
    Tensor tp_rate, fp_rate, rec_rate;
    tp_rate.Resize({num_thresholds});
    fp_rate.Resize({num_thresholds});
    rec_rate.Resize({num_thresholds});
-    float* tp_rate_data = tp_rate.mutable_data<float>(ctx.GetPlace());
-    float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
-    float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
+    auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
+    auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
+    auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
    for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+      tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
                        (tp_data[i] + fn_data[i] + epsilon);
      fp_rate_data[i] =
-          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+          static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
                         (tp_data[i] + fp_data[i] + epsilon);
    }
    *auc_data = 0.0f;

--- a/paddle/fluid/operators/detection/target_assign_op.h
+++ b/paddle/fluid/operators/detection/target_assign_op.h
@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
    int64_t k = x->dims()[2];

    auto x_lod = x->lod().back();
+#if defined(PADDLE_WITH_CUDA)
    size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
+#else
+    size_t* x_lod_data = x_lod.data();
+#endif

    TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
                                       mismatch_value, n, m, p, k, out_data,
@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
      const int* neg_idx_data = neg_indices->data<int>();
      auto neg_lod = neg_indices->lod().back();
+#if defined(PADDLE_WITH_CUDA)
      size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+#else
+      size_t* neg_lod_data = neg_lod.data();
+#endif
      NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
      neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
                      mismatch_value, out_data, out_wt_data);

--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
+# Skip this directory entirely when distributed support is disabled.
+if(NOT WITH_DISTRIBUTE)
+    return()
+endif()
+
+# send_recv.proto is generated from send_recv.proto.in. cc_generic_services
+# is "false" for the gRPC build and "true" otherwise; it is presumably
+# consumed by the template (template not visible here).
+set(cc_generic_services "true")
+if(WITH_GRPC)
+    set(cc_generic_services "false")
+endif()
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
+
 if(WITH_GRPC)
-  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
-      selected_rows memory)
+  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
+      PROTO send_recv.proto 
+      DEPS lod_tensor selected_rows memory)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-          cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
-  cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
-          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
-          proto_desc lookup_table_op SERIAL)
+  cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
+     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+  cc_test(grpc_server_test SRCS rpc_server_test.cc 
+    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_table_op SERIAL)
  return()
 endif()


 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc  rpc_client.cc request_handler_impl.cc
+
+set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
+brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc 
+    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
  PROTO send_recv.proto
  DEPS lod_tensor selected_rows memory)

-find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
-ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
-
+set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)

-find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
-ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
+cc_test(brpc_server_test SRCS rpc_server_test.cc 
+    DEPS ${brpc_test_depends} SERIAL)

-cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc 
-       brpc protobuf leveldb gflags glog
-       protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
+cc_test(brpc_serde_test SRCS brpc_serde_test.cc 
+    DEPS ${brpc_test_depends} SERIAL)
--- a/paddle/fluid/operators/distributed/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 //       file and did some modifications so that we can send gRPC
 //       requests without too much copying of the tensor data.

-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/distributed/bytebuffer_stream.h
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "grpc++/grpc++.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"

 namespace grpc {
 // A ZeroCopyInputStream that reads from grpc_byte_buffer
@@ -107,25 +108,6 @@ class GrpcBufferReader final
 namespace paddle {
 namespace operators {
 namespace distributed {
-// Source provides a way for a particular RPC implementation to provide
-// received data to ParseFrom.
-class Source {
- public:
-  virtual ~Source() {}
-
-  // Return the stream that contains the data to be parsed.
-  // Note that this method might be invoked more than once if
-  // ParseFrom needs to fall back to a more expensive parsing method.
-  // Every call must return a stream pointing at the beginning of
-  // the serialized RecvTensorResponse.
-  //
-  // Note that a subsequent call to contents() invalidates previous
-  // results of contents().
-  //
-  // Ownership of the returned stream is retained by the Source and
-  // should not be deleted by the caller.
-  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
-};

 // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
 class GrpcByteBufferSource

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -20,6 +20,7 @@ limitations under the License. */

 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"


--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -38,7 +38,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN

@@ -46,23 +49,6 @@ namespace paddle {
 namespace operators {
 namespace distributed {

-struct VarHandle {
-  // RPC endpoint.
-  std::string ep;
-  const platform::DeviceContext* ctx;
-  const framework::Scope* scope;
-  // Variable name.
-  std::string name;
-  // RPC method name.
-  std::string method;
-
-  std::string String() const {
-    std::ostringstream s;
-    s << method << " name:[" << name << "], ep:[" << ep << "]";
-    return s.str();
-  }
-};
-
 void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);

 class BaseProcessor {

--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+#include <sys/time.h>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+// Serializes `var` (sent under the name `name`) into the gRPC byte buffer
+// `msg`.
+//
+// The message is built from slices so that payload memory is referenced
+// rather than copied into the encoded header:
+//   - LoDTensor:    [header, tensor payload]
+//   - SelectedRows: [header, tensor payload, rows header, rows data]
+//   - ncclUniqueId: one slice holding header and id (CUDA builds only)
+//
+// `ctx` is forwarded to the payload helpers. When it refers to a GPU place
+// (CUDA builds), the payload is released with memory::Free on a
+// CUDAPinnedPlace once the slice is destroyed. A non-empty `out_name` is
+// stored in the request's out_varname field. Any other variable type raises
+// through PADDLE_THROW.
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_name) {
+  // Size of the scratch buffer used to encode the message headers.
+  constexpr int kHeaderBufSize = 1024;
+
+  // The default DestroyCallback does nothing; when sending from a GPU place
+  // the CPU staging buffer has to be freed instead (see below).
+  DestroyCallback destroy_callback = [](void* backing) {};
+  VarMsg request;
+  void* payload = nullptr;
+  // Stays 0 when no payload helper runs (the NCCL id branch).
+  size_t payload_size = 0;
+
+  request.set_varname(name);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(platform::kEnableProfiler);
+    } else {
+      request.set_profile(platform::kDisableProfiler);
+    }
+  }
+  if (!out_name.empty()) {
+    request.set_out_varname(out_name);
+  }
+  if (var->IsType<framework::LoDTensor>()) {
+    request.set_type(::sendrecv::LOD_TENSOR);
+    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
+  } else if (var->IsType<framework::SelectedRows>()) {
+    request.set_type(::sendrecv::SELECTED_ROWS);
+    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
+#ifdef PADDLE_WITH_CUDA
+  } else if (var->IsType<ncclUniqueId>()) {
+    request.set_type(::sendrecv::NCCL_ID);
+#endif
+  } else {
+    // Type() already identifies the held type; wrapping it in typeid() would
+    // report the type of the returned type object instead.
+    PADDLE_THROW("Serialize does not support type: %s", var->Type().name());
+  }
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    // GPU data is copied to CPU buffer when sending,
+    // free the buffer when possible.
+    destroy_callback = [](void* backing) {
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
+    };
+#endif
+  }
+
+  // Encode the protobuf header into the scratch buffer.
+  std::string header;
+  request.AppendToString(&header);
+  std::unique_ptr<char[]> buffer(new char[kHeaderBufSize]);
+  ProtoEncodeHelper e(buffer.get(), kHeaderBufSize);
+  e.WriteRawBytes(header);
+// NCCLID is copied directly to the message, return bytebuffer
+// with only one slice if serializing NCCLID.
+#ifdef PADDLE_WITH_CUDA
+  if (var->IsType<ncclUniqueId>()) {
+    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
+                              NCCL_UNIQUE_ID_BYTES);
+    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
+
+    // for serialize NCCL_ID
+    ::grpc::Slice slices(e.size());
+    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
+    ::grpc::ByteBuffer tmp(&slices, 1);
+    msg->Swap(&tmp);
+    return;
+  }
+#endif
+
+  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
+  // steal reference of tensor data
+  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
+  int num_slices = 2;       // only SelectedRows have rows buffer
+  slices[0] = ::grpc::Slice(e.size());
+  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
+  slices[1] = ::grpc::Slice(
+      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
+                                    static_cast<char*>(payload)),
+      ::grpc::Slice::STEAL_REF);
+
+  if (var->IsType<framework::SelectedRows>()) {
+    auto* slr = var->GetMutable<framework::SelectedRows>();
+    // The scratch buffer can be reused: the first header was already copied
+    // into slices[0].
+    ProtoEncodeHelper e2(buffer.get(), 128);
+    size_t rows_memory_size =
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
+    slices[2] = ::grpc::Slice(e2.size());
+    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
+
+    // The rows buffer is referenced in place with a no-op destroy callback,
+    // so `var` presumably has to outlive the message -- TODO confirm.
+    slices[3] = ::grpc::Slice(
+        grpc_slice_new_with_user_data(
+            const_cast<void*>(
+                reinterpret_cast<const void*>(slr->rows().data())),
+            rows_memory_size, [](void* backing) {},
+            const_cast<char*>(
+                reinterpret_cast<const char*>(slr->rows().data()))),
+        ::grpc::Slice::STEAL_REF);
+    num_slices = 4;
+  }
+
+  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
+  msg->Swap(&tmp);
+}
+
+// Parses `msg` with a GRPCVariableResponse built from `scope` and `ctx`, and
+// stores the resulting variable pointer in `*var`. A non-zero parse result
+// fails the PADDLE_ENFORCE below.
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var) {
+  GRPCVariableResponse response(scope, &ctx);
+  const auto parse_status = response.Parse(msg);
+  PADDLE_ENFORCE(parse_status == 0, "parse bytebuffer to tensor error!");
+  *var = response.GetVar();
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sys/time.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+// Callback invoked with a slice's backing pointer when the slice is released.
+using DestroyCallback = void (*)(void*);
+
+// Serializes `var`, sent under `name`, into `msg`. A non-empty `out_varname`
+// is recorded in the request as the output variable name.
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_varname = "");
+
+// Parses `msg` using `scope` and `ctx`, and returns the parsed variable
+// through `var`.
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -21,8 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/grpc_serde.h"
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-#include "paddle/fluid/operators/distributed/variable_response.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
@@ -84,7 +86,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
  // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
  framework::Scope scope;
  scope.Var("myvar");
-  operators::distributed::VariableResponse resp(&scope, &ctx);
+  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
  EXPECT_EQ(resp.Parse(msg), 0);

  framework::Variable* var2 = resp.GetVar();
@@ -171,7 +173,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
  // deserialize zero-copy
  framework::Scope scope;
  scope.Var("myvar");
-  operators::distributed::VariableResponse resp(&scope, &ctx);
+  operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
  if (from_type == 0) {
    EXPECT_EQ(resp.Parse(msg), 0);
  } else {

--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <limits>
 #include <string>

+#include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/grpc_server.h"

 using ::grpc::ServerAsyncResponseWriter;
@@ -84,9 +85,9 @@ class RequestSend final : public RequestBase {
                       ::grpc::ServerCompletionQueue* cq,
                       RequestHandler* request_handler, int req_id)
      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx(),
-                                        !request_handler->sync_mode()));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx(),
+                                            !request_handler->sync_mode()));
    int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
    service_->RequestAsyncUnary(
        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
@@ -109,7 +110,7 @@ class RequestSend final : public RequestBase {

 protected:
  sendrecv::VoidMessage reply_;
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };

@@ -161,8 +162,8 @@ class RequestPrefetch final : public RequestBase {
      : RequestBase(service, cq, request_handler, req_id),
        responder_(&ctx_),
        local_scope_(nullptr) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx(), true));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx(), true));
    int method_id =
        static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
    service_->RequestAsyncUnary(
@@ -194,7 +195,7 @@ class RequestPrefetch final : public RequestBase {
  }

 protected:
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
  ::grpc::ByteBuffer reply_;
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* local_scope_;
@@ -206,8 +207,8 @@ class RequestCheckpointNotify final : public RequestBase {
                                   ::grpc::ServerCompletionQueue* cq,
                                   RequestHandler* request_handler, int req_id)
      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
-    request_.reset(new VariableResponse(request_handler->scope(),
-                                        request_handler->dev_ctx()));
+    request_.reset(new GRPCVariableResponse(request_handler->scope(),
+                                            request_handler->dev_ctx()));
    int method_id =
        static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
    service_->RequestAsyncUnary(
@@ -234,7 +235,7 @@ class RequestCheckpointNotify final : public RequestBase {
  }

 protected:
-  std::shared_ptr<VariableResponse> request_;
+  std::shared_ptr<GRPCVariableResponse> request_;
  sendrecv::VoidMessage reply_;
  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
 };

--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -23,8 +23,7 @@
 #include <grpc++/impl/codegen/stub_options.h>
 #include <grpc++/impl/codegen/sync_stream.h>
 #include <grpc++/support/byte_buffer.h>
-#include "paddle/fluid/operators/distributed/variable_response.h"
-
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
 #include "paddle/fluid/platform/profiler.h"

 // NOTE: This method was originally created by tensorflow
@@ -42,17 +41,18 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::VariableResponse.
 // Wire-format is identical to RecvVariableResponse.
 template <>
-class SerializationTraits<paddle::operators::distributed::VariableResponse> {
+class SerializationTraits<
+    paddle::operators::distributed::GRPCVariableResponse> {
 public:
  static Status Serialize(
-      const paddle::operators::distributed::VariableResponse& msg,
+      const paddle::operators::distributed::GRPCVariableResponse& msg,
      grpc_byte_buffer** bp, bool* own_buffer) {
    PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
    return Status();
  }
  static Status Deserialize(
      grpc_byte_buffer* buffer,
-      paddle::operators::distributed::VariableResponse* msg,
+      paddle::operators::distributed::GRPCVariableResponse* msg,
      int max_message_size = INT_MAX) {
    if (buffer == nullptr) {
      return Status(StatusCode::INTERNAL, "No payload");

--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <utility>
+#include <vector>
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+
+#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+// Subset of the protobuf wire types that this parser handles. The values are
+// what the low three bits of a field tag hold (see GetTagWireType).
+enum WireType {
+  WIRETYPE_VARINT = 0,
+  WIRETYPE_LENGTH_DELIMITED = 2,
+};
+
+// Extracts the field number from a protobuf tag by discarding its low three
+// (wire-type) bits.
+inline int GetTagFieldNumber(uint32_t tag) {
+  const uint32_t field_number = tag >> 3;
+  return field_number;
+}
+
+// Returns the wire type stored in the low three bits of a protobuf tag.
+inline WireType GetTagWireType(uint32_t tag) {
+  const uint32_t wire_bits = tag & 0x7;
+  return static_cast<WireType>(wire_bits);
+}
+
+// Reads one varint from |input| and stores it in |*result| as an int.
+// Returns false, leaving |*result| untouched, when the read fails or the
+// decoded value is larger than INT_MAX.
+bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
+                         int* result) {
+  uint64_t raw = 0;
+  if (!input->ReadVarint64(&raw)) {
+    return false;
+  }
+  if (raw > static_cast<uint64_t>(INT_MAX)) {
+    return false;
+  }
+  *result = static_cast<int>(raw);
+  return true;
+}
+
+// Parses a gRPC byte buffer: wraps it in a GrpcByteBufferSource and forwards
+// to Parse(Source*), returning that call's result unchanged.
+int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
+  GrpcByteBufferSource buffer_source;
+  buffer_source.Init(byte_buffer);
+
+  GrpcByteBufferSourceWrapper wrapped(&buffer_source);
+  return Parse(&wrapped);
+}
+
+// Parses the body of a VariableMessage.LodData sub-message from |input| and
+// appends every lod_data value to |*lod|. Both the unpacked (one varint per
+// tag) and the packed (length-delimited run of varints) encodings are
+// accepted.
+//
+// Returns true when ReadTagWithCutoff reports no further tag and the field
+// number is 0. Returns false on a read failure, an unexpected wire type, or
+// any field other than lod_data.
+bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
+                  std::vector<int64_t>* lod) {
+  while (true) {
+    auto p = input->ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+
+    if (!p.second) {
+      return (tag == 0);
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
+        // Unpacked encoding: a single varint follows the tag.
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input->ReadVarint64(&v)) {
+            return false;
+          }
+          lod->push_back(v);
+          break;
+        }
+
+        // Packed encoding: a byte length followed by that many bytes of
+        // varints.
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          // This function returns bool, so a failure must be reported as
+          // `false`. Returning the (non-zero) field number would convert to
+          // `true` and hide the error from the caller.
+          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
+            return false;
+          }
+          int start_pos = input->CurrentPosition();
+          while (input->CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input->ReadVarint64(&v)) {
+              return false;
+            }
+            lod->push_back(v);
+          }
+          break;
+        }
+
+        return false;
+      }
+      default: { return false; }
+    }
+  }
+
+  return true;
+}
+
+// Decodes a serialized sendrecv::VariableMessage from |source| field by field,
+// filling meta_ and handing the payload fields to ProcSerializedField /
+// CopySelectRowsData.
+//
+// Returns:
+//   0      the input was consumed without error.
+//   -1     the input ended on a non-zero tag, or an unknown field was seen.
+//   other  the field number of the field that failed to parse.
+int GRPCVariableResponse::Parse(Source* source) {
+  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
+      source->contents();
+  ::google::protobuf::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (true) {
+    auto p = input.ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      if (tag != 0) {
+        return -1;
+      }
+      return 0;
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage::kVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kTypeFieldNumber: {
+        uint32_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_type(static_cast<::sendrecv::VarType>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
+        uint32_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDimsFieldNumber: {
+        // not packed
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input.ReadVarint64(&v)) {
+            return tag;
+          }
+          meta_.add_dims(v);
+          break;
+        }
+
+        // packed
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
+            return tag;
+          }
+          int start_pos = input.CurrentPosition();
+          while (input.CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input.ReadVarint64(&v)) {
+              return tag;
+            }
+            meta_.add_dims(v);
+          }
+          break;
+        }
+        return tag;
+      }
+      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_lod_level(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kLodFieldNumber: {
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+
+        // Limit reads to the |length| bytes of the nested LodData message
+        // while ParseLodData runs, then restore the previous limit.
+        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
+            input.IncrementRecursionDepthAndPushLimit(length);
+
+        std::vector<int64_t> lod_data;
+        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
+          return tag;
+        }
+
+        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
+          return tag;
+        }
+
+        if (lod_data.size() == 0) {
+          break;
+        }
+
+        auto lod = meta_.add_lod();
+        for (uint32_t i = 0; i < lod_data.size(); i++) {
+          lod->add_lod_data(lod_data[i]);
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_slr_height(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kSerializedFieldNumber: {
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!ProcSerializedField(tag, &input, num_bytes)) {
+          return tag;
+        }
+
+        break;
+      }
+      case sendrecv::VariableMessage::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return tag;
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_out_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        uint64_t profiling = 0;
+        // Reject a mismatched wire type before decoding, exactly as the other
+        // varint fields in this switch do.
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&profiling)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
+          break;
+        }
+        if (profiling == platform::kEnableProfiler &&
+            !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (profiling == platform::kDisableProfiler &&
+                   platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
+        }
+        break;
+      }
+      default: {
+        // Unknown tag, return unknown error.
+        return -1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
--- a/paddle/fluid/framework/details/ssa_graph.h
+++ b/paddle/fluid/framework/details/ssa_graph.h
@@ -14,36 +14,45 @@

 #pragma once

-#include <map>
 #include <string>
-#include <vector>

-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"

 namespace paddle {
-namespace framework {
-namespace details {
-
-// A SSA graph used by parallel executor.
-struct SSAGraph {
-  // all variable in each devices.
-  // The outside vector is the device vector. Each element of this vector is a
-  // map from variable name to variables. The variables, who have the same name,
-  // will have a different version. The offset in the
-  // `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
-  std::vector<
-      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
-      vars_;
-
-  // aux variables to represent dependency. Useful to resolve data hazard.
-  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
-
-  // all operators. NOTE that even we use a vector here, the operators is
-  // unordered.
-  std::vector<std::unique_ptr<OpHandleBase>> ops_;
+namespace operators {
+namespace distributed {
+
+class GRPCVariableResponse : public VariableResponse {
+ public:
+  GRPCVariableResponse(const framework::Scope* scope,
+                       const platform::DeviceContext* dev_ctx,
+                       bool create_scope = false)
+      : VariableResponse(scope, dev_ctx, create_scope) {}
+
+  virtual ~GRPCVariableResponse() {}
+
+  int Parse(Source* source) override;
+
+  // Return value:
+  //  0: ok.
+  // -1: unknown error.
+  // other: field number of the field that failed to parse.
+  int Parse(const ::grpc::ByteBuffer& byte_buffer);
 };

-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -51,6 +51,23 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";

 class RPCServer;

+struct VarHandle {  // Endpoint, variable name and method name of one RPC.
+  // RPC endpoint.
+  std::string ep;
+  const platform::DeviceContext* ctx;
+  const framework::Scope* scope;
+  // Variable name.
+  std::string name;
+  // RPC method name.
+  std::string method;
+
+  std::string String() const {  // "<method> name:[<name>], ep:[<ep>]"
+    std::ostringstream s;
+    s << method << " name:[" << name << "], ep:[" << ep << "]";
+    return s.str();
+  }
+};
+
 class RequestHandler {
 public:
  explicit RequestHandler(bool sync_mode)

--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -53,7 +53,7 @@ bool RequestSendHandler::Handle(const std::string& varname,

  // Sync
  if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv batch barrier message";
+    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
    rpc_server_->IncreaseBatchBarrier(kRequestSend);
  } else if (varname == BEGIN_PASS_MESSAGE) {
    VLOG(3) << "sync: recv begin pass message";
@@ -65,8 +65,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
    VLOG(3) << "sync: processing received var: " << varname;

    if (invar == nullptr) {
-      LOG(ERROR) << "sync: Can not find server side var: " << varname;
-      PADDLE_THROW("sync: Can not find server side var");
+      LOG(FATAL) << "sync: Can not find server side var: " << varname;
      return false;
    }
    if (invar->IsType<framework::SelectedRows>()) {

--- a/paddle/fluid/operators/distributed/send_recv.proto
+++ b/paddle/fluid/operators/distributed/send_recv.proto
+
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
 the Apache License, Version 2.0 (the "License"); you may not use this file
 except in compliance with the License.
@@ -14,7 +15,7 @@ limitations under the License. */
 syntax = "proto3";
 package sendrecv;

-// option cc_generic_services = true;
+option cc_generic_services = @cc_generic_services@;

 service SendRecvService {
  // For parameter server round-robin like hashing, do not split tensors.

--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -12,21 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
 #include <sys/time.h>
 #include <thread>  // NOLINT

-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
-#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
 #include "paddle/fluid/operators/distributed/variable_response.h"
-#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace operators {
@@ -34,6 +28,13 @@ namespace distributed {

 using VarMsg = sendrecv::VariableMessage;

+#ifdef PADDLE_WITH_CUDA
+void* GetVarPayLoad(const std::string varname, int64_t size) {
+  platform::CUDAPinnedPlace cuda_pinned;
+  return memory::Alloc(cuda_pinned, size);  // |varname| is not used here.
+}
+#endif
+
 void GetTensorPayload(framework::Variable* var,
                      const platform::DeviceContext& ctx, VarMsg* request,
                      void** payload, size_t* payload_size) {
@@ -58,15 +59,17 @@ void GetTensorPayload(framework::Variable* var,
  if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-    platform::CUDAPinnedPlace cuda_pinned;
+    // platform::CUDAPinnedPlace cuda_pinned;
    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    *payload = memory::Alloc(cuda_pinned, copy_size);
+    *payload = GetVarPayLoad(request->varname(), copy_size);

+    platform::CUDAPinnedPlace cuda_pinned;
    memory::Copy(cuda_pinned, *payload,
                 boost::get<platform::CUDAPlace>(tensor.place()),
                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
                 gpu_dev_ctx.stream());
+
    ctx.Wait();
 #endif
  } else {
@@ -91,10 +94,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
  auto* tensor = slr->mutable_value();
  if (platform::is_gpu_place(ctx.GetPlace())) {
 #ifdef PADDLE_WITH_CUDA
-    platform::CUDAPinnedPlace cuda_pinned;
    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
    auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
-    *payload = memory::Alloc(cuda_pinned, copy_size);
+    *payload = GetVarPayLoad(request->varname(), copy_size);
+
+    platform::CUDAPinnedPlace cuda_pinned;
    memory::Copy(cuda_pinned, *payload,
                 boost::get<platform::CUDAPlace>(tensor->place()),
                 reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
@@ -107,126 +111,6 @@ void GetSelectedRowsPayload(framework::Variable* var,
  *payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
 }

-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_name) {
-  // Default DestroyCallback does nothing, When using GPU
-  // the CPU buffer need to be freed.
-  DestroyCallback destroy_callback = [](void* backing) {};
-  VarMsg request;
-  void* payload = nullptr;
-  size_t payload_size;
-
-  request.set_varname(name);
-  // Note: normally the profiler is enabled in 1 trainer, hence only
-  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
-  // servers the trainer's profiling state so that PS can follow the
-  // trainer.
-  if (platform::ShouldSendProfileState()) {
-    if (platform::IsProfileEnabled()) {
-      request.set_profile(platform::kEnableProfiler);
-    } else {
-      request.set_profile(platform::kDisableProfiler);
-    }
-  }
-  if (!out_name.empty()) {
-    request.set_out_varname(out_name);
-  }
-  if (var->IsType<framework::LoDTensor>()) {
-    request.set_type(::sendrecv::LOD_TENSOR);
-    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
-  } else if (var->IsType<framework::SelectedRows>()) {
-    request.set_type(::sendrecv::SELECTED_ROWS);
-    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
-#ifdef PADDLE_WITH_CUDA
-  } else if (var->IsType<ncclUniqueId>()) {
-    request.set_type(::sendrecv::NCCL_ID);
-#endif
-  } else {
-    PADDLE_THROW("Serialize does not support type: %s",
-                 typeid(var->Type()).name());
-  }
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-    // GPU data is copied to CPU buffer when sending,
-    // free the buffer when possible.
-    destroy_callback = [](void* backing) {
-      platform::CUDAPinnedPlace cuda_pinned;
-      memory::Free(cuda_pinned, backing);
-    };
-#endif
-  }
-
-  std::string header;
-  request.AppendToString(&header);
-  auto buffer = std::unique_ptr<char[]>(new char[1024]);
-  void* buf = buffer.get();
-  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
-  e.WriteRawBytes(std::string(header.data(), header.size()));
-// NCCLID is copied directly to the message, return bytebuffer
-// with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_CUDA
-  if (var->IsType<ncclUniqueId>()) {
-    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
-                              NCCL_UNIQUE_ID_BYTES);
-    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
-    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
-
-    // for serialize NCCL_ID
-    ::grpc::Slice slices(e.size());
-    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
-    ::grpc::ByteBuffer tmp(&slices, 1);
-    msg->Swap(&tmp);
-    return;
-  }
-#endif
-
-  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-  // steal reference of tensor data
-  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
-  int num_slices = 2;       // only SelectedRows have rows buffer
-  slices[0] = ::grpc::Slice(e.size());
-  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
-  slices[1] = ::grpc::Slice(
-      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
-                                    static_cast<char*>(payload)),
-      ::grpc::Slice::STEAL_REF);
-
-  if (var->IsType<framework::SelectedRows>()) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
-    size_t rows_memory_size =
-        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
-    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
-    slices[2] = ::grpc::Slice(e2.size());
-    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
-
-    slices[3] = ::grpc::Slice(
-        grpc_slice_new_with_user_data(
-            const_cast<void*>(
-                reinterpret_cast<const void*>(slr->rows().data())),
-            rows_memory_size, [](void* backing) {},
-            const_cast<char*>(
-                reinterpret_cast<const char*>(slr->rows().data()))),
-        ::grpc::Slice::STEAL_REF);
-    num_slices = 4;
-  }
-
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var) {
-  operators::distributed::VariableResponse resp(scope, &ctx);
-  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
-  *var = resp.GetVar();
-}
-
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -25,24 +25,21 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"

-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"

 namespace paddle {
 namespace operators {
 namespace distributed {

-typedef void (*DestroyCallback)(void*);
+using VarMsg = sendrecv::VariableMessage;

-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg,
-                           const std::string& out_varname = std::string());
+void GetTensorPayload(framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      void** payload, size_t* payload_size);

-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               const framework::Scope* scope,
-                               framework::Variable** var);
+void GetSelectedRowsPayload(framework::Variable* var,
+                            const platform::DeviceContext& ctx, VarMsg* request,
+                            void** payload, size_t* payload_size);

 inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
  switch (type) {

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,50 +13,20 @@
 // limitations under the License.

 #include "paddle/fluid/operators/distributed/variable_response.h"
-
-#include <string>
-#include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_CUDA
-#include <nccl.h>
-#endif
-#include "paddle/fluid/platform/profiler.h"
-
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
 #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"

 namespace paddle {
 namespace operators {
 namespace distributed {

-enum WireType {
-  WIRETYPE_VARINT = 0,
-  WIRETYPE_LENGTH_DELIMITED = 2,
-};
-
-inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
-
-inline WireType GetTagWireType(uint32_t tag) {
-  return static_cast<WireType>(tag & 0x7);
-}
-
-bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
-                         int* result) {
-  uint64_t v;
-  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
-    *result = static_cast<int>(v);
-    return true;
-  } else {
-    return false;
-  }
-}
-
-bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
-             const platform::DeviceContext& dev_ctx, platform::Place place,
-             void* dest, int size) {
+bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
+                               const platform::DeviceContext& dev_ctx,
+                               platform::Place place, void* dest,
+                               int64_t size) {
  const void* data = NULL;
  int size_to_write = 0;
-  int length = size;
+  int64_t length = size;
  int total_written = 0;

  if (platform::is_gpu_place(place)) {
@@ -194,294 +164,49 @@ bool VariableResponse::CopySelectRowsData(
  return true;
 }

-bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
-                  std::vector<int64_t>* lod) {
-  while (true) {
-    auto p = input->ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-
-    if (!p.second) {
-      return (tag == 0);
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
-        uint64_t v;
-        if (wt == WIRETYPE_VARINT) {
-          if (!input->ReadVarint64(&v)) {
-            return false;
-          }
-          lod->push_back(v);
-          break;
-        }
-
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input->CurrentPosition();
-          while (input->CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input->ReadVarint64(&v)) {
-              return tag;
-            }
-            lod->push_back(v);
-          }
-          break;
-        }
+bool VariableResponse::ProcSerializedField(
+    int tag, ::google::protobuf::io::CodedInputStream* input,
+    int64_t num_bytes) {
+  PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                  meta_.type() == sendrecv::LOD_TENSOR ||
+                  meta_.type() == sendrecv::NCCL_ID) &&
+                     meta_.varname() != "",
+                 "meta info should be got first!");

+  if (meta_.type() == sendrecv::NCCL_ID) {
+#ifdef PADDLE_WITH_CUDA
+    auto* var = scope_->FindVar(meta_.varname());
+    if (var != nullptr) {
+      ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
+      if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal,
+                   num_bytes)) {
        return false;
      }
-      default: { return false; }
    }
-  }
-
-  return true;
-}
-
-int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
-  GrpcByteBufferSource source;
-  source.Init(byte_buffer);
-  GrpcByteBufferSourceWrapper r(&source);
-
-  return Parse(&r);
-}
-
-int VariableResponse::Parse(Source* source) {
-  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
-      source->contents();
-  ::google::protobuf::io::CodedInputStream input(input_stream);
-  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
-
-  while (true) {
-    auto p = input.ReadTagWithCutoff(127);
-    int tag = GetTagFieldNumber(p.first);
-    WireType wt = GetTagWireType(p.first);
-    if (!p.second) {
-      if (tag != 0) {
-        return -1;
-      }
-      return 0;
-    }
-
-    switch (tag) {
-      case sendrecv::VariableMessage::kVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
-
-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
-
-        meta_.set_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kTypeFieldNumber: {
-        uint32_t v;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_type(static_cast<::sendrecv::VarType>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
-        uint32_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
-          return tag;
-        }
-
-        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kDimsFieldNumber: {
-        // not packed
-        if (wt == WIRETYPE_VARINT) {
-          uint64_t v;
-          if (!input.ReadVarint64(&v)) {
-            return tag;
-          }
-          meta_.add_dims(v);
-          break;
-        }
-
-        // packed
-        if (wt == WIRETYPE_LENGTH_DELIMITED) {
-          int num_bytes = 0;
-          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
-            return tag;
-          }
-          int start_pos = input.CurrentPosition();
-          while (input.CurrentPosition() - start_pos < num_bytes) {
-            uint64_t v;
-            if (!input.ReadVarint64(&v)) {
-              return tag;
-            }
-            meta_.add_dims(v);
-          }
-          break;
-        }
-        return tag;
-      }
-      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_lod_level(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kLodFieldNumber: {
-        int length = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &length)) {
-          return tag;
-        }
-
-        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
-            input.IncrementRecursionDepthAndPushLimit(length);
-
-        std::vector<int64_t> lod_data;
-        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
-          return tag;
-        }
-
-        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
-          return false;
-        }
-
-        if (lod_data.size() == 0) {
-          break;
-        }
-
-        auto lod = meta_.add_lod();
-        for (uint32_t i = 0; i < lod_data.size(); i++) {
-          lod->add_lod_data(lod_data[i]);
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
-        uint64_t v = 0;
-        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
-          return tag;
-        }
-        meta_.set_slr_height(static_cast<int64_t>(v));
-        break;
-      }
-      case sendrecv::VariableMessage::kSerializedFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR ||
-                        meta_.type() == sendrecv::NCCL_ID) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (meta_.type() == sendrecv::NCCL_ID) {
-#ifdef PADDLE_WITH_CUDA
-          auto* var = scope_->FindVar(meta_.varname());
-          if (var != nullptr) {
-            ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
-            if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal,
-                         num_bytes)) {
-              return tag;
-            }
-          }
-          break;
+    return true;
 #else
-          PADDLE_THROW("Not compiled with CUDA!");
+    PADDLE_THROW("Not compiled with CUDA!");
+    return false;
 #endif
-        }
-
-        framework::DDim dims = GetDims(meta_.dims());
-        if (meta_.type() == sendrecv::LOD_TENSOR) {
-          PADDLE_ENFORCE(meta_.lod_size() >= 0,
-                         "lod info should be got first!");
-          if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) {
-            return tag;
-          }
-          break;
-        }
-
-        if (meta_.type() == sendrecv::SELECTED_ROWS) {
-          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) {
-            return tag;
-          }
-          break;
-        }
-
-        return tag;
-      }
-      case sendrecv::VariableMessage::kRowsFieldNumber: {
-        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
-                        meta_.type() == sendrecv::LOD_TENSOR) &&
-                           meta_.varname() != "",
-                       "meta info should be got first!");
-
-        int num_bytes = 0;
-        if (wt != WIRETYPE_LENGTH_DELIMITED ||
-            !ReadVarintSizeAsInt(&input, &num_bytes)) {
-          return tag;
-        }
-
-        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
-          return tag;
-        }
-        break;
-      }
-      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
-        uint32_t length;
-        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
-          return tag;
-        }
+  }

-        std::string temp;
-        if (!input.ReadString(&temp, length)) {
-          return tag;
-        }
+  framework::DDim dims = GetDims(meta_.dims());
+  if (meta_.type() == sendrecv::LOD_TENSOR) {
+    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
+    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
+      return false;
+    }
+    return true;
+  }

-        meta_.set_out_varname(temp);
-        break;
-      }
-      case sendrecv::VariableMessage::kProfileFieldNumber: {
-        uint64_t profiling = 0;
-        if (!input.ReadVarint64(&profiling)) {
-          return tag;
-        }
-        meta_.set_profile(profiling);
-        int64_t listener_id = platform::ListenerId();
-        if (listener_id <= 0) {
-          break;
-        }
-        if (profiling == platform::kEnableProfiler &&
-            !platform::IsProfileEnabled()) {
-          platform::EnableProfiler(platform::ProfilerState::kCPU);
-        } else if (profiling == platform::kDisableProfiler &&
-                   platform::IsProfileEnabled()) {
-          // TODO(panyx0718): Should we allow to customize file dir.
-          platform::DisableProfiler(
-              platform::EventSortingKey::kDefault,
-              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
-        }
-        break;
-      }
-      default: {
-        // Unknown tag, return unknown error.
-        return -1;
-      }
+  if (meta_.type() == sendrecv::SELECTED_ROWS) {
+    if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) {
+      return false;
    }
+    return true;
  }

-  return 0;
+  return true;
 }

 };  // namespace distributed

--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -22,18 +22,35 @@
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type.h"

-#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/distributed/send_recv.pb.h"
-
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"

 namespace paddle {
 namespace operators {
 namespace distributed {

+// Source provides a way for a particular RPC implementation to provide
+// received data to VariableResponse::Parse.
+class Source {
+ public:
+  virtual ~Source() {}
+
+  // Return the stream that contains the data to be parsed.
+  // Note that this method might be invoked more than once if
+  // Parse needs to fall back to a more expensive parsing method.
+  // Every call must return a stream pointing at the beginning of
+  // the serialized sendrecv::VariableMessage.
+  //
+  // Note that a subsequent call to contents() invalidates previous
+  // results of contents().
+  //
+  // Ownership of the returned stream is retained by the Source; the
+  // stream must not be deleted by the caller.
+  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
+};
+
 class VariableResponse {
 public:
  VariableResponse(const framework::Scope* scope,
@@ -51,22 +68,19 @@ class VariableResponse {
    }
  }

-  // return:
-  // 0:ok.
-  // -1: unkown error.
-  // other: number of error field.
-  int Parse(Source* source);
+  int Parse(Source* source, const sendrecv::VariableMessage& meta) {
+    meta_ = meta;
+    return Parse(source);
+  }

  // return:
  // 0:ok.
  // -1: unkown error.
  // other: number of error field.
-  int Parse(const ::grpc::ByteBuffer& byte_buffer);
-
-  const framework::Scope& GetLocalScope() const { return *local_scope_; }
-
-  framework::Scope* GetMutableLocalScope() const { return local_scope_; }
+  virtual int Parse(Source* source) = 0;

+  inline const framework::Scope& GetLocalScope() const { return *local_scope_; }
+  inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
  inline std::string Varname() const { return meta_.varname(); }
  inline std::string OutVarname() const { return meta_.out_varname(); }

@@ -78,7 +92,11 @@ class VariableResponse {
    return scope_->FindVar(meta_.varname());
  }

- private:
+ protected:
+  bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
+               const platform::DeviceContext& dev_ctx, platform::Place place,
+               void* dest, int64_t size);
+
  bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
                                const platform::DeviceContext& ctx,
                                const framework::DDim& dims, int length);
@@ -90,12 +108,16 @@ class VariableResponse {
                         const platform::DeviceContext& ctx,
                         const framework::DDim& dims, int length);

- private:
+  bool ProcSerializedField(int tag,
+                           ::google::protobuf::io::CodedInputStream* input,
+                           int64_t num_bytes);
+
+ protected:
  const framework::Scope* scope_;
  const platform::DeviceContext* dev_ctx_;
  bool create_scope_ = false;
  framework::Scope* local_scope_ = nullptr;
-  // only Skeleton
+
  sendrecv::VariableMessage meta_;
 };


--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -37,6 +37,7 @@ struct CBlas<float> {
    libxsmm_sgemm(args...);
  }
 #endif
+
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    platform::dynload::cblas_saxpy(args...);
@@ -76,6 +77,7 @@ struct CBlas<double> {
    libxsmm_dgemm(args...);
  }
 #endif
+
  template <typename... ARGS>
  static void AXPY(ARGS... args) {
    platform::dynload::cblas_daxpy(args...);
@@ -150,6 +152,7 @@ struct CBlas<double> {
  }
 };
 #endif
+
 template <>
 struct CBlas<platform::float16> {
  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
@@ -190,30 +193,48 @@ inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
  return false;
 }

-template <>
 template <typename T>
-void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
-                                            CBLAS_TRANSPOSE transB, int M,
-                                            int N, int K, T alpha, const T *A,
-                                            const T *B, T beta, T *C) const {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
+inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
+                      CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
+                      const T *A, int lda, const T *B, int ldb, T beta, T *C,
+                      int ldc) {
 #ifdef PADDLE_WITH_LIBXSMM
-  if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
-              beta)) {
+  if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
+                 beta)) {
    // Note: SMM use ColMajor
    const char transa = 'N';
    const char transb = 'N';
    CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
                       &beta, C, &ldc);
-  } else {
+    return;
+  }
 #endif
-    CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
-                   ldb, beta, C, ldc);
-#ifdef PADDLE_WITH_LIBXSMM
+
+#ifdef PADDLE_MKL_SPLIT_GEMM
+  constexpr int bs = 2;  // rows of A and C handled by each small GEMM call
+  if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
+    for (int off = 0; off < M; off += bs) {  // step C by ldc, not B's ldb
+      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
+                     A + off * lda, lda, B, ldb, beta, C + off * ldc, ldc);
+    }
+    return;
  }
 #endif
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                            CBLAS_TRANSPOSE transB, int M,
+                                            int N, int K, T alpha, const T *A,
+                                            const T *B, T beta, T *C) const {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+               beta, C, ldc);
 }

 template <>
@@ -222,9 +243,9 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
                                            int N, int K, T alpha, const T *A,
                                            int lda, const T *B, int ldb,
                                            T beta, T *C, int ldc) const {
-  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-                 lda, B, ldb, beta, C, ldc);
+  GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+               transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+               lda, B, ldb, beta, C, ldc);
 }

 template <typename DeviceContext>

--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -228,3 +228,57 @@ TEST(math_funciton, set_constant) {
  }
  delete ctx;
 }
+
+// Checks that Blas<CPUDeviceContext>::GEMM (which goes through GEMM_WARP)
+// produces the same C = alpha * A * B + beta * C as a direct CBlas<T>::GEMM.
+template <typename T>
+void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor mat_b;
+  paddle::framework::Tensor mat_c_ref;
+  paddle::framework::Tensor mat_c_mkl;
+  paddle::platform::CPUPlace cpu_place;  // stack object, nothing to leak
+
+  T* A = mat_a.mutable_data<T>({m, k}, cpu_place);
+  T* B = mat_b.mutable_data<T>({k, n}, cpu_place);
+  T* CREF = mat_c_ref.mutable_data<T>({m, n}, cpu_place);
+  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, cpu_place);
+
+  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
+  for (int i = 0; i < mat_a.numel(); ++i) {
+    A[i] = static_cast<T>(i);
+  }
+  for (int i = 0; i < mat_b.numel(); ++i) {
+    B[i] = static_cast<T>(i + 1);
+  }
+  for (int i = 0; i < mat_c_ref.numel(); ++i) {
+    CREF[i] = static_cast<T>(i + 2);
+    CMKL[i] = CREF[i];  // both outputs start equal so beta != 0 is comparable
+  }
+
+  // this would call gemm_warp
+  paddle::platform::CPUDeviceContext context(cpu_place);
+  GetBlas<T>(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B,
+                           beta, CREF);
+
+  // lda,ldb,ldc follow RowMajor
+  const int lda = k, ldb = n, ldc = n;
+  paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
+                                          CblasNoTrans, m, n, k, alpha, A, lda,
+                                          B, ldb, beta, CMKL, ldc);
+
+  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
+    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
+  }
+}
+
+TEST(math_function, gemm_warp) {  // float and double, beta = 0 and beta = 1
+  GemmWarpTest<float>(3, 2, 5, 1.f, 0.f);  // m = 3: odd row count
+  GemmWarpTest<float>(3, 2, 5, 2.f, 1.f);
+  GemmWarpTest<float>(8, 5, 6, 1.f, 0.f);  // m = 8: even row count
+  GemmWarpTest<float>(8, 5, 6, 2.f, 1.f);
+  GemmWarpTest<double>(3, 2, 5, 1.0, 0.0);
+  GemmWarpTest<double>(3, 2, 5, 2.0, 1.0);
+  GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
+  GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
+}
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
    auto lods = lod_tensor.lod();
    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");

-    auto lod = lods[0];
+    const auto& lod = lods[0];

    std::vector<SeqInfo> seq_info;
    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {

--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -98,7 +98,7 @@ The update equations are as follows:
 $$
 velocity = mu * velocity + gradient \\
 if (use\_nesterov):   \\
-  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+  param = param - (gradient + mu * velocity) * learning\_rate \\
 else:   \\
  param = param - learning\_rate * velocity. \\
 $$

--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
      T g_val = g[i];
      T v_new = v[i] * mu + g_val;
      v_out[i] = v_new;
-      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+      p_out[i] = p[i] - (g_val + v_new * mu) * lr;
    }
  } else {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;

--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {

    v_out = v * mu + g;
    if (use_nesterov) {
-      p_out = p - (g - v_out * mu) * lr[0];
+      p_out = p - (g + v_out * mu) * lr[0];
    } else {
      p_out = p - lr[0] * v_out;
    }

--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -15,12 +15,13 @@ function(reader_library TARGET_NAME)
        PARENT_SCOPE)
 endfunction()

-reader_library(open_files_op SRCS open_files_op.cc)
+cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
+reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
 reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
-reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
+reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 reader_library(create_py_reader_op SRCS create_py_reader_op.cc)

--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+namespace reader {
+// Shuts the underlying reader down, then waits on every future still held in
+// position_. The queued tasks capture `this`, so they have to finish before
+// the members they write to are destroyed.
+BufferedReader::~BufferedReader() {
+  reader_->Shutdown();
+  for (; !position_.empty(); position_.pop()) {
+    position_.front().wait();
+  }
+}
+
+// Wraps `reader`, sizes both staging buffers to `buffer_size` slots and
+// immediately queues one asynchronous read per slot.
+BufferedReader::BufferedReader(
+    const std::shared_ptr<framework::ReaderBase> &reader,
+    const platform::Place &place, size_t buffer_size)
+    : framework::DecoratedReader(reader),
+      thread_pool_(1),
+      place_(place),
+      buffer_size_(buffer_size) {
+  // buffer_size_ was initialised from buffer_size just above.
+  cpu_buffer_.resize(buffer_size_);
+  gpu_buffer_.resize(buffer_size_);
+  ReadTillBufferFullAsync();
+}
+
+// Queues one asynchronous read for every buffer slot. Enforces that no read
+// is pending when it is called.
+void BufferedReader::ReadTillBufferFullAsync() {
+  PADDLE_ENFORCE_EQ(position_.size(), 0U);
+  size_t slot = 0;
+  while (slot < buffer_size_) {
+    ReadAsync(slot);
+    ++slot;
+  }
+}
+
+// Schedules a read of the next batch into buffer slot `i` on thread_pool_ and
+// appends the task's future to position_.
+//
+// The future yields `i` when a batch was read, or -1UL when the underlying
+// reader produced an empty batch.
+void BufferedReader::ReadAsync(size_t i) {
+  position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
+    TensorVec &cpu = cpu_buffer_[i];
+    reader_->ReadNext(&cpu);
+
+    if (cpu.empty()) {
+      return -1UL;
+    }
+
+    if (platform::is_gpu_place(place_)) {
+      TensorVec &gpu = gpu_buffer_[i];
+      gpu.resize(cpu.size());
+      // `t` indexes the tensors inside the batch. It deliberately does not
+      // reuse the name `i`, which is the captured slot index returned below.
+      for (size_t t = 0; t < cpu.size(); ++t) {
+        framework::TensorCopySync(cpu[t], place_, &gpu[t]);
+        gpu[t].set_lod(cpu[t].lod());
+      }
+    }
+    return i;
+  }));
+}
+
+// Shuts the underlying reader down and discards every pending read.
+void BufferedReader::ShutdownImpl() {
+  reader_->Shutdown();
+  // NOTE(review): unlike the destructor, the futures are dropped here without
+  // wait(). Confirm that no queued task can still be running when StartImpl()
+  // enqueues new reads for the same buffer slots.
+  while (!position_.empty()) {
+    position_.pop();
+  }
+  // Forget the last handed-out slot so ReadNextImpl() does not re-queue it.
+  prev_pos_ = -1UL;
+}
+
+// Restarts the underlying reader, then queues one read per buffer slot.
+void BufferedReader::StartImpl() {
+  reader_->Start();
+  ReadTillBufferFullAsync();
+}
+
+// Copies the next prefetched batch into `out`. `out` is cleared instead when
+// no read is pending, including the case where every pending read reported
+// an empty batch.
+void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
+  // Drain futures until one carries a real slot index; -1UL marks a read for
+  // which the underlying reader returned nothing.
+  size_t i = -1UL;
+  while (i == -1UL) {
+    if (position_.empty()) {
+      out->clear();
+      return;
+    }
+    i = position_.front().get();
+    position_.pop();
+  }
+
+  *out = platform::is_gpu_place(place_) ? gpu_buffer_[i] : cpu_buffer_[i];
+
+  // Do not push current position into ReadAsync. Push the previous position
+  // Since all computation in fluid are async, change the data of
+  // current position may cause data error.
+  if (prev_pos_ != -1UL) {
+    ReadAsync(prev_pos_);
+  }
+  prev_pos_ = i;
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <list>
+#include <queue>
+#include <vector>
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+// Decorated reader that prefetches batches from the underlying reader through
+// thread_pool_ and keeps them in a fixed set of reusable buffer slots.
+class BufferedReader : public framework::DecoratedReader {
+  using TensorVec = std::vector<framework::LoDTensor>;
+
+ public:
+  // `place` selects which buffer ReadNextImpl() hands out (GPU copy or CPU
+  // original); `buffer_size` is the number of slots that are prefetched.
+  BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
+                 const platform::Place& place, size_t buffer_size);
+
+  ~BufferedReader() override;
+
+ private:
+  // Queues one asynchronous read per slot; requires that none is pending.
+  void ReadTillBufferFullAsync();
+
+  // Queues an asynchronous read into slot `i`.
+  void ReadAsync(size_t i);
+
+ protected:
+  void ShutdownImpl() override;
+  void StartImpl() override;
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  ThreadPool thread_pool_;
+  platform::Place place_;
+  const size_t buffer_size_;
+
+  // Pending reads in issue order. Each future yields the slot index that was
+  // filled, or -1UL when the underlying reader returned an empty batch.
+  std::queue<std::future<size_t>> position_;
+
+  // The buffer for reading data.
+  // NOTE: the simplest way to implement buffered reader is do not use any
+  // buffer, just read async and create futures as buffer size. However, to
+  // malloc tensors every time is extremely slow. Here we store all data in
+  // buffers and prevent alloc every time.
+  std::vector<TensorVec> cpu_buffer_;
+  std::vector<TensorVec> gpu_buffer_;
+  // Slot returned by the previous ReadNextImpl(); -1UL when there is none.
+  size_t prev_pos_{-1UL};
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -12,83 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <thread>  // NOLINT
-
-#include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"

 namespace paddle {
 namespace operators {
 namespace reader {
-
-// 'Double buffer' means we shall maintain two batches of input data at the same
-// time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 3;
-// There will be two bacthes out of the channel during training:
-// 1. the one waiting to be sent to the channel
-// 2. the one just be received from the channel, which is also being used by
-// subsequent operators.
-// So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
-
-class DoubleBufferReader : public framework::DecoratedReader {
- public:
-  explicit DoubleBufferReader(
-      const std::shared_ptr<ReaderBase>& reader,
-      platform::Place target_place = platform::CPUPlace())
-      : DecoratedReader(reader), place_(target_place) {
-    cpu_tensor_cache_.resize(kCacheSize);
-    gpu_tensor_cache_.resize(kCacheSize);
-#ifdef PADDLE_WITH_CUDA
-    if (platform::is_gpu_place(place_)) {
-      for (size_t i = 0; i < kCacheSize; ++i) {
-        ctxs_.emplace_back(new platform::CUDADeviceContext(
-            boost::get<platform::CUDAPlace>(place_)));
-      }
-    }
-#endif
-    StartPrefetcher();
-  }
-
-  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
-
-  ~DoubleBufferReader() { EndPrefetcher(); }
-
- private:
-  void ShutdownImpl() override {
-    EndPrefetcher();
-    reader_->Shutdown();
-  }
-
-  void StartImpl() override {
-    reader_->Start();
-    StartPrefetcher();
-  }
-
-  void StartPrefetcher() {
-    channel_ = new reader::BlockingQueue<size_t>(kChannelSize);
-    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
-  }
-
-  void EndPrefetcher() {
-    channel_->Close();
-    if (prefetcher_.joinable()) {
-      prefetcher_.join();
-    }
-    delete channel_;
-    channel_ = nullptr;
-  }
-
-  void PrefetchThreadFunc();
-
-  std::thread prefetcher_;
-  reader::BlockingQueue<size_t>* channel_;
-  platform::Place place_;
-  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
-  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
-  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
-};
-
 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
 public:
  using framework::OperatorBase::OperatorBase;
@@ -118,8 +47,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
      place = platform::CUDAPlace(static_cast<int>(num));
    }

-    out->Reset(framework::MakeDecoratedReader<DoubleBufferReader>(
-        underlying_reader, place));
+    out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
+                                                              place, 2));
  }
 };

@@ -146,51 +75,6 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
  }
 };

-void DoubleBufferReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
-  size_t cached_tensor_id;
-  if (channel_->Receive(&cached_tensor_id)) {
-    if (platform::is_gpu_place(place_)) {
-      *out = gpu_tensor_cache_[cached_tensor_id];
-    } else {
-      // CPU place
-      *out = cpu_tensor_cache_[cached_tensor_id];
-    }
-  } else {
-    out->clear();
-  }
-}
-
-void DoubleBufferReader::PrefetchThreadFunc() {
-  VLOG(5) << "A new prefetch thread starts.";
-  size_t cached_tensor_id = 0;
-  while (true) {
-    auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id];
-    reader_->ReadNext(&cpu_batch);
-    if (cpu_batch.empty()) {
-      // The underlying reader have no next data.
-      break;
-    }
-    if (platform::is_gpu_place(place_)) {
-      auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id];
-      gpu_batch.resize(cpu_batch.size());
-      for (size_t i = 0; i < cpu_batch.size(); ++i) {
-        // TODO(fengjiayi): Use asynchronous TensorCopy instead
-        framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]);
-        gpu_batch[i].set_lod(cpu_batch[i].lod());
-      }
-    }
-    if (!channel_->Send(cached_tensor_id)) {
-      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread will terminate.";
-      break;
-    }
-    ++cached_tensor_id;
-    cached_tensor_id %= kCacheSize;
-  }
-  channel_->Close();
-  VLOG(5) << "Prefetch thread terminates.";
-}
-
 }  // namespace reader
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -33,6 +33,8 @@ class PyReader : public framework::FileReader {
    if (!success) out->clear();
  }

+  ~PyReader() { queue_->Close(); }
+
  void Shutdown() override { queue_->Close(); }

  void Start() override { queue_->ReOpen(); }

--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -33,11 +33,14 @@ class RecordIOFileReader : public framework::FileReader {

 protected:
  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    std::unique_ptr<std::lock_guard<std::mutex>> guard;
    if (ThreadSafe) {
-      std::lock_guard<std::mutex> guard(*mutex_);
-      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
-    } else {
-      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
+      guard.reset(new std::lock_guard<std::mutex>(*mutex_));
+    }
+
+    bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out);
+    if (!ok) {
+      out->clear();
    }
  }


--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -48,9 +48,9 @@ class ShuffleReader : public framework::DecoratedReader {

 private:
  void ShutdownImpl() override {
+    reader_->Shutdown();
    buffer_.clear();
    iteration_pos_ = 0;
-    reader_->Shutdown();
  }

  void StartImpl() override {

--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -12,150 +12,200 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <cmath>
+#include <stdexcept>
 #include <thread>  // NOLINT
-
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"

 namespace paddle {
 namespace operators {
 namespace reader {

-class MultiFileReader : public framework::ReaderBase {
+class IReaderContainer {
 public:
-  MultiFileReader(const std::vector<std::string>& file_names, size_t thread_num,
-                  size_t buffer_size)
-      : buffer_size_(buffer_size) {
-    readers_.reserve(file_names.size());
-    for (const std::string& f_name : file_names) {
-      readers_.emplace_back(CreateReaderByFileName(f_name));
+  virtual ~IReaderContainer() {}
+  virtual void AppendReader(
+      std::unique_ptr<framework::ReaderBase>&& readers) = 0;
+  virtual void Stop() = 0;
+  virtual void Start() = 0;
+  virtual void ReadNext(std::vector<framework::LoDTensor>* out) = 0;
+};
+
+class OrderedReaderContainer : public IReaderContainer {
+ public:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace(std::move(reader));
+  }
+
+  void Stop() override {
+    while (!pending_.empty()) {
+      MoveFrontPendingToDone();
    }
-    prefetchers_.resize(thread_num);
-    StartNewScheduler();
  }

-  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
+  void Start() override { std::swap(done_, pending_); }

-  ~MultiFileReader() { EndScheduler(); }
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    // Drain readers front to back; an empty batch retires the front reader.
+    while (!pending_.empty()) {
+      pending_.front()->ReadNext(out);
+      if (!out->empty()) {
+        return;
+      }
+      MoveFrontPendingToDone();
+    }
+    out->clear();  // no pending reader left: hand back an empty batch
+  }

 private:
-  void ShutdownImpl() override { EndScheduler(); }
-
-  void StartImpl() override { StartNewScheduler(); }
-
-  void StartNewScheduler();
-  void EndScheduler();
-  void ScheduleThreadFunc();
-  void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
-
-  std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
-  std::thread scheduler_;
-  std::vector<std::thread> prefetchers_;
-  size_t buffer_size_;
-  reader::BlockingQueue<size_t>* waiting_reader_idx_;
-  reader::BlockingQueue<size_t>* available_thread_idx_;
-  reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
+  void MoveFrontPendingToDone() {  // retire the front reader, reset for reuse
+    pending_.front()->Shutdown();
+    pending_.front()->Start();
+    done_.emplace(std::move(pending_.front()));  // leaves a null slot to pop
+    pending_.pop();
+  }
+
+  std::queue<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::queue<std::unique_ptr<framework::ReaderBase>> done_;
 };

-void MultiFileReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
-  if (!buffer_->Receive(out)) {
-    out->clear();
-  }
-}
+class PreemptiveReaderContainer : public IReaderContainer {
+  using ReaderList = std::list<std::unique_ptr<framework::ReaderBase>>;

-void MultiFileReader::StartNewScheduler() {
-  size_t thread_num = prefetchers_.size();
-  waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
-  available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
-  buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
-      buffer_size_);
+  struct FutureItem {  // result produced by one asynchronous read task
+    std::vector<framework::LoDTensor> data_;  // batch read; empty if exhausted
+    ReaderList::iterator reader_it_;  // reader that was asked for data_
+    std::exception_ptr exception_;  // set only when the read task threw
+  };

-  for (size_t i = 0; i < readers_.size(); ++i) {
-    waiting_reader_idx_->Send(i);
-  }
-  waiting_reader_idx_->Close();
-  for (size_t i = 0; i < thread_num; ++i) {
-    available_thread_idx_->Send(i);
-  }
+  using FutureList = std::list<std::future<FutureItem>>;

-  scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
-}
+ public:
+  explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {}

-void MultiFileReader::EndScheduler() {
-  available_thread_idx_->Close();
-  buffer_->Close();
-  waiting_reader_idx_->Close();
-  if (scheduler_.joinable()) {
-    scheduler_.join();
-  }
-  delete buffer_;
-  delete available_thread_idx_;
-  delete waiting_reader_idx_;
-}
-
-void MultiFileReader::ScheduleThreadFunc() {
-  VLOG(5) << "MultiFileReader schedule thread starts.";
-  size_t completed_thread_num = 0;
-  size_t thread_idx;
-  while (available_thread_idx_->Receive(&thread_idx)) {
-    std::thread& prefetcher = prefetchers_[thread_idx];
-    if (prefetcher.joinable()) {
-      prefetcher.join();
-    }
-    size_t reader_idx;
-    if (waiting_reader_idx_->Receive(&reader_idx)) {
-      // Still have files to read. Start a new prefetch thread.
-      prefetcher = std::thread([this, reader_idx, thread_idx] {
-        PrefetchThreadFunc(reader_idx, thread_idx);
-      });
-    } else {
-      // No more file to read.
-      ++completed_thread_num;
-      if (completed_thread_num == prefetchers_.size()) {
-        buffer_->Close();
-        break;
+  void Stop() override {
+    if (!pending_.empty()) {
+      for (auto& reader : pending_) {
+        reader->Shutdown();
+      }
+      for (auto& fu : futures_) {
+        fu.wait();
      }
+      futures_.clear();
+      for (auto& reader : pending_) {
+        reader->Start();
+        done_.emplace_back(std::move(reader));
+      }
+      pending_.clear();
+      bool timeout;
+      complete_queue_.PopAll(1000, &timeout);
+      PADDLE_ENFORCE(!timeout);
    }
  }
-  // If users invoke Shutdown() when scheduler is running, it will close the
-  // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
-  // to release their resource. So a check is needed before scheduler ends.
-  for (auto& p : prefetchers_) {
-    if (p.joinable()) {
-      p.join();
+
+  void Start() override {
+    for (auto& reader : done_) {
+      AppendReader(std::move(reader));
    }
+    done_.clear();
  }
-  VLOG(5) << "MultiFileReader schedule thread terminates.";
-}
-
-void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
-  VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
-  std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
-  while (true) {
-    std::vector<framework::LoDTensor> ins;
-    reader->ReadNext(&ins);
-    if (ins.empty()) {
-      reader->Shutdown();
-      reader->Start();
-      break;
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!pending_.empty()) {
+      auto future_it = complete_queue_.Pop();
+      FutureItem item = future_it->get();
+      if (item.exception_) {
+        for (auto it = futures_.begin(); it != futures_.end(); ++it) {
+          if (it != future_it) {
+            it->wait();  // Wait all other threads complete.
+          }
+        }
+        std::rethrow_exception(item.exception_);
+
+      } else if (item.data_.empty()) {  // reader done.
+        done_.emplace_back(std::move(*item.reader_it_));
+        pending_.erase(item.reader_it_);
+        futures_.erase(future_it);
+        ReadNext(out);
+      } else {
+        *out = item.data_;
+        // continue read async
+        ReadAsync(item.reader_it_, &future_it);
+      }
+    } else {
+      out->clear();
    }
-    try {
-      buffer_->Send(std::move(ins));
-    } catch (paddle::platform::EnforceNotMet e) {
-      VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
-                 "thread of file idx '"
-              << reader_idx << "' will terminate.";
-      break;
+  }
+
+ private:
+  void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
+    pending_.emplace_back(std::move(reader));
+    auto reader_it = pending_.end();
+    --reader_it;  // iterator to the reader just appended
+
+    futures_.emplace_back();
+    auto future_it = futures_.end();
+    --future_it;  // iterator to the empty future slot just appended
+
+    ReadAsync(reader_it, &future_it);  // enqueue the first read for it
+  }
+
+  void ReadAsync(const ReaderList::iterator& reader_it,  // enqueue one read
+                 FutureList::iterator* future_it_ptr) {
+    auto& future_it = *future_it_ptr;  // copied into the task's capture below
+    *future_it = pool_.enqueue([reader_it, future_it, this] {
+      try {
+        FutureItem item;
+        item.reader_it_ = reader_it;
+        (*reader_it)->ReadNext(&item.data_);
+        if (item.data_.empty()) {  // reader exhausted: reset it for reuse
+          (*reader_it)->Shutdown();
+          (*reader_it)->Start();
+        }
+        complete_queue_.Push(future_it);  // queue it for ReadNext() to collect
+        return item;
+      } catch (...) {  // hand the failure to the consumer through the item
+        FutureItem item;
+        item.exception_ = std::current_exception();
+        complete_queue_.Push(future_it);
+        return item;
+      }
+    });
+  }
+
+  FutureList futures_;
+  ThreadPool pool_;
+  framework::BlockingQueue<FutureList::iterator> complete_queue_;
+  std::list<std::unique_ptr<framework::ReaderBase>> pending_;
+  std::list<std::unique_ptr<framework::ReaderBase>> done_;
+};
+
+class MultiFileReader : public framework::ReaderBase {
+ public:
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  std::unique_ptr<IReaderContainer>&& container)
+      : container_(std::move(container)) {
+    for (auto& fn : file_names) {
+      container_->AppendReader(CreateReaderByFileName(fn));
    }
  }

-  if (!available_thread_idx_->Send(thread_idx)) {
-    VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
-               "Fail to send thread_idx.";
+  ~MultiFileReader() { container_->Stop(); }
+
+ protected:
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    container_->ReadNext(out);
  }
-  VLOG(5) << "The prefetch thread of file idx '" << reader_idx
-          << "' terminates.";
-}
+  void ShutdownImpl() override { container_->Stop(); }
+  void StartImpl() override { container_->Start(); }
+
+ private:
+  std::unique_ptr<IReaderContainer> container_;
+};

 class OpenFilesOp : public framework::OperatorBase {
 public:
@@ -173,13 +223,27 @@ class OpenFilesOp : public framework::OperatorBase {
                      "shape concat's length.");
    const auto& file_names = Attr<std::vector<std::string>>("file_names");
    PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
-    const size_t thread_num = Attr<int>("thread_num");
-    const size_t buffer_size = Attr<int>("buffer_size");
+    bool is_test = Attr<bool>("is_test");

    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(
-        std::make_shared<MultiFileReader>(file_names, thread_num, buffer_size));
+    std::unique_ptr<IReaderContainer> container;
+
+    if (is_test) {
+      container.reset(new OrderedReaderContainer());
+    } else {
+      container.reset(new PreemptiveReaderContainer(
+          static_cast<size_t>(Attr<int>("thread_num"))));
+    }
+
+    std::shared_ptr<framework::ReaderBase> reader(
+        new MultiFileReader(file_names, std::move(container)));
+    auto buffer_size = Attr<int>("buffer_size");
+    if (buffer_size > 1) {
+      reader = framework::MakeDecoratedReader<BufferedReader>(
+          reader, platform::CPUPlace(), buffer_size);
+    }
+    out->Reset(reader);
  }
 };

@@ -187,9 +251,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
 protected:
  void Apply() override {
    AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
-    AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
-        .GreaterThan(0);
-    AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
+    AddAttr<bool>("is_test", "Used for testing data.").SetDefault(false);

    AddComment(R"DOC(
      OpenFiles Operator
@@ -197,6 +259,11 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
      An OpenFilesOp creates a MultiFileReader, which is able to
      read data multi-threaded from multiple files.
    )DOC");
+    AddAttr<int>("thread_num",
+                 "The maximal concurrent prefetch thread number. Used only "
+                 "when is_test = False");
+    AddAttr<int>("buffer_size", "The reading buffer of these files.")
+        .GreaterThan(0);
  }
 };


--- a/paddle/fluid/operators/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
                      ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             float, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             double, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int, ops::SumGradFunctor>,
-                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
-                                             int64_t, ops::SumGradFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum_grad,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
+                             ops::SumGradFunctor>,
+    ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                             ops::SumGradFunctor>);
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -14,11 +14,69 @@

 #pragma once

+#include <vector>
+
 #include "paddle/fluid/operators/reduce_op.h"

 namespace paddle {
 namespace operators {

+// Use for loops to speed up Eigen broadcast: 4 times faster than broadcast.
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceSumGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto dims = context.Attr<std::vector<int>>("dim");
+    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
+        dims.size() == 1) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      auto* output_d = output->mutable_data<T>(context.GetPlace());
+      const auto* input2_d = input2->data<T>();
+
+      // handle reduce_all: copy the single dOut value into every dX element
+      if (input2->dims().size() == 1 && input2->dims()[0] == 1) {
+        const int64_t numel = framework::product(input0->dims());  // hoisted
+        for (int64_t i = 0; i < numel; ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+
+      // handle reduce by one dimension (a negative index counts from the end)
+      int reduce_dim_index = dims[0];
+      if (reduce_dim_index < 0) {
+        reduce_dim_index += input0->dims().size();
+      }
+      // X is indexed as [before_dim, reduce_dim, after_dim] in the loops below
+      auto& input_dim = input0->dims();
+      int64_t before_dim = 1;
+      for (int i = 0; i < reduce_dim_index; ++i) {
+        before_dim *= input_dim[i];
+      }
+      int64_t reduce_dim = input_dim[reduce_dim_index];
+      int64_t after_dim = 1;
+      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
+        after_dim *= input_dim[i];
+      }
+      for (int64_t i = 0; i < before_dim; ++i) {
+        for (int64_t j = 0; j < reduce_dim; ++j) {
+          for (int64_t k = 0; k < after_dim; ++k) {
+            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
+                input2_d[i * after_dim + k];
+          }
+        }
+      }
+      return;
+    }
+
+    // otherwise fall back to the default Eigen broadcast kernel
+    ReduceGradKernel<DeviceContext, T, Functor> kernel;
+    kernel.Compute(context);
+  }
+};
+
 struct SumFunctor {
  template <typename DeviceContext, typename X, typename Y, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
@@ -31,7 +89,7 @@ struct SumGradFunctor {
            typename DY, typename Dim>
  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                  const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
+    dx->device(place) = dy->eval().broadcast(dim);
  }
 };


--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -24,6 +24,9 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"

 namespace paddle {
+
+DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+
 namespace operators {

 using inference::Singleton;
@@ -53,7 +56,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");

-  // We should delete the batch size here.
  switch (shape.size()) {
    case 2:
      return nvinfer1::Dims2(1, shape[1]);
@@ -91,27 +93,36 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
  engine->InitNetwork();

  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  VLOG(4) << "parsed var size " << block.AllVars().size();
  // Add inputs
  VLOG(4) << "declare inputs";
  for (auto &input : context.Inputs("Xs")) {
    VLOG(4) << "declare input " << input;
    auto *var = block.FindVar(input);
+    // TensorRT engine needs to create parameters. The parameter's description
+    // should be set in the block, so the variable lookup must succeed.
+    PADDLE_ENFORCE(var, "no variable called %s", input);
    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                      "TensorRT engine only takes LoDTensor as input");
    auto shape = var->GetShape();
+    // For the special batch_size placeholder -1, drop it and pass the real
+    // shape of data.
+    // TODO(Superjomn) fix this with batch broadcast, or it can't handle
+    // variable batch sizes.
+    if (shape[0] == -1) {
+      shape[0] = FLAGS_tensorrt_engine_batch_size;
+    }
    engine->DeclareInput(
        input, FluidDataType2TRT(
                   var->Proto()->type().lod_tensor().tensor().data_type()),
-        Vec2TRT_Dims(var->GetShape()));
+        Vec2TRT_Dims(shape));
  }

  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
      block_desc, parameters, context.scope(), engine);

  // Add outputs
-  VLOG(4) << "declare outputs";
  for (auto &output : context.Outputs("Ys")) {
-    VLOG(4) << "declare output " << output;
    engine->DeclareOutput(output);
  }

@@ -152,4 +163,7 @@ REGISTER_OP_CPU_KERNEL(
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);

+// A trick to compile with the needed TensorRT op converter.
+USE_TRT_CONVERTER(mul)
+
 #endif  // PADDLE_WITH_CUDA
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -24,6 +24,9 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"

 namespace paddle {
+
+DECLARE_int32(tensorrt_engine_batch_size);
+
 namespace operators {

 using inference::Singleton;
@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    VLOG(4) << "TensorRTEngineKernel executing";
    auto engine_name = context.Attr<std::string>("engine_uniq_key");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
@@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    // Try to determine a batch_size
-    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
-        context.scope(), input_names.front());
-    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
+                      context.Attr<int>("max_batch"));

    // Convert input tensor from fluid to engine.
    for (const auto& x : context.Inputs("Xs")) {
@@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      }
    }
    // Execute the engine.
-    PADDLE_ENFORCE_GT(batch_size, 0);
-    engine->Execute(batch_size);
+    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
+    engine->Execute(FLAGS_tensorrt_engine_batch_size);
    // Convert output tensor from engine to fluid
    for (const auto& y : context.Outputs("Ys")) {
      // convert output and copy to fluid.
@@ -94,16 +93,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
      auto* fluid_v = context.scope().FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+
      fluid_t->Resize(framework::make_ddim(ddim));

-      if (platform::is_cpu_place(fluid_t->place())) {
-        // TODO(Superjomn) change this float to dtype size.
-        engine->GetOutputInCPU(
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()));
-      } else {
-        engine->GetOutputInGPU(
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()));
-      }
+      // TODO(Superjomn) find some way to determine which device to output the
+      // tensor.
+      // if (platform::is_cpu_place(fluid_t->place())) {
+      // TODO(Superjomn) change this float to dtype size.
+      engine->GetOutputInCPU(
+          y, fluid_t->mutable_data<float>(platform::CPUPlace()));
+      //} else {
+      // engine->GetOutputInGPU(
+      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+      // size * sizeof(float));
+      //}
    }

    cudaStreamSynchronize(*engine->stream());

--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -38,6 +38,7 @@ limitations under the License. */
 #endif
 #endif

+#include <boost/any.hpp>
 #include <boost/mpl/comparison.hpp>
 #include <boost/mpl/less_equal.hpp>
 #include <boost/variant.hpp>
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -248,15 +248,11 @@ PYBIND11_PLUGIN(core) {
 #endif
           })
      .def("rows", [](SelectedRows &self) {
-#ifndef PADDLE_WITH_CUDA
-        return self.rows();
-#else
-         auto rows = self.rows();
-         std::vector<int64_t> new_rows;
-         new_rows.reserve(rows.size());
-         std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
-         return new_rows;
-#endif
+        auto rows = self.rows();
+        std::vector<int64_t> new_rows;
+        new_rows.reserve(rows.size());
+        std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
+        return new_rows;
      });

  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.

--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
@@ -30,7 +30,9 @@ class RecordIOWriter {
 public:
  RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
                 size_t max_num_record)
-      : stream_(filename), writer_(&stream_, compressor, max_num_record) {}
+      : closed_(false),
+        stream_(filename),
+        writer_(&stream_, compressor, max_num_record) {}

  void AppendTensor(const framework::LoDTensor& tensor) {
    tensors_.push_back(tensor);
@@ -47,9 +49,17 @@ class RecordIOWriter {
    PADDLE_ENFORCE(tensors_.empty());
    writer_.Flush();
    stream_.close();
+    closed_ = true;
+  }
+
+  ~RecordIOWriter() {
+    if (!closed_) {
+      Close();
+    }
  }

 private:
+  bool closed_;
  std::vector<framework::LoDTensor> tensors_;
  std::ofstream stream_;
  recordio::Writer writer_;

--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -28,6 +28,7 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream)

 Scanner::Scanner(const std::string &filename)
    : stream_(new std::ifstream(filename)), parser_(*stream_) {
+  PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
  Reset();
 }


--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -333,7 +333,7 @@ function assert_api_not_changed() {
    python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
    deactivate

-    API_CHANGE=`git diff --name-only HEAD^ | grep "paddle/fluid/API.spec" || true`
+    API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
    echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
    if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
        # TODO: curl -H 'Authorization: token ${TOKEN}'
@@ -599,11 +599,11 @@ function main() {
      cicheck)
        cmake_gen ${PYTHON_ABI:-""}
        build
-        assert_api_not_changed
        run_test
        gen_capi_package
        gen_fluid_inference_lib
        test_fluid_inference_lib
+        assert_api_not_changed
        ;;
      *)
        print_usage

--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']


-def batch(reader, batch_size, drop_last=True):
+def batch(reader, batch_size, drop_last=False):
    """
    Create a batched reader.


--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
                for i in xrange(buffer_size):
                    yield images[i, :], int(labels[i])
        finally:
-            m.terminate()
-            l.terminate()
+            try:
+                m.terminate()
+            except:
+                pass
+            try:
+                l.terminate()
+            except:
+                pass

    return reader


--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -35,6 +35,7 @@ import io
 import evaluator
 import initializer
 import layers
+import contrib
 import nets
 import optimizer
 import backward
@@ -66,6 +67,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
              'io',
              'initializer',
              'layers',
+              'contrib',
              'transpiler',
              'nets',
              'optimizer',

--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Re-export everything the `decoder` subpackage declares public, so that the
+# names listed in `decoder.__all__` are also reachable from this package.
+import decoder
+from decoder import *
+
+__all__ = decoder.__all__
--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Re-export everything `beam_search_decoder` declares public, so that the
+# names listed in `beam_search_decoder.__all__` are reachable from this package.
+import beam_search_decoder
+from beam_search_decoder import *
+
+__all__ = beam_search_decoder.__all__
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module provides a general beam search decoder API for RNN based decoders.
+The purpose of this API is to allow users to highly customize the behavior
+within their RNN decoder (vanilla RNN, LSTM, attention + LSTM, and so on),
+without using the low level API such as while ops.
+
+This API is still under active development and may change drastically.
+"""
+
+import contextlib
+import numpy as np
+
+from ... import layers
+from ...framework import Variable
+from ... import core
+from ... import framework, unique_name
+from ...layer_helper import LayerHelper
+
+__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder']
+
+
+class _DecoderType(object):
+    """
+    Integer tags that identify the kind of decoder a `StateCell` is attached
+    to. Decoders expose one of these values through their `type` property.
+    """
+    # Tag used by `TrainingDecoder`.
+    TRAINING = 1
+    # Tag used by `BeamSearchDecoder`.
+    BEAM_SEARCH = 2
+
+
+class InitState(object):
+    """
+    Wrapper around the variable that seeds one hidden state of an RNN cell.
+    Instances are normally passed to `StateCell` through its `states` dict.
+
+    Args:
+        init (Variable): Variable used directly as the initial hidden state.
+            When None, a constant tensor is built from `init_boot`, `shape`,
+            `value` and `dtype` instead.
+        shape (tuple|list): Shape forwarded to
+            `layers.fill_constant_batch_size_like` when `init` is None.
+            Default None.
+        value (float): Fill value of the created tensor when `init` is None.
+            Default 0.0.
+        init_boot (Variable): Reference variable given as `input` to
+            `layers.fill_constant_batch_size_like`. Required when `init` is
+            None.
+        need_reorder (bool): If set true, the init will be sorted by its lod
+            rank within its batches. This should be used if `batch_size > 1`.
+            Default False.
+        dtype (np.dtype|core.VarDesc.VarType|str): Data type of the created
+            tensor when `init` is None. Default 'float32'.
+
+    Raises:
+        ValueError: If both `init` and `init_boot` are None.
+
+    Returns:
+        An initialized state object.
+
+    Examples:
+        See `StateCell`.
+    """
+
+    def __init__(self,
+                 init=None,
+                 shape=None,
+                 value=0.0,
+                 init_boot=None,
+                 need_reorder=False,
+                 dtype='float32'):
+        if init is None:
+            # No explicit variable: one has to be synthesized, which needs a
+            # reference variable to derive the batch size from.
+            if init_boot is None:
+                raise ValueError(
+                    'init_boot must be provided to infer the shape of InitState .\n')
+            init = layers.fill_constant_batch_size_like(
+                input=init_boot, value=value, shape=shape, dtype=dtype)
+
+        self._init = init
+        self._shape = shape
+        self._value = value
+        self._need_reorder = need_reorder
+        self._dtype = dtype
+
+    @property
+    def value(self):
+        """The variable holding the initial state (not the `value` argument)."""
+        return self._init
+
+    @property
+    def need_reorder(self):
+        """Whether the initial state was requested to be reordered."""
+        return self._need_reorder
+
+
+class _MemoryState(object):
+    """
+    State holder that keeps a hidden state in a memory of the given RNN
+    object (created through `rnn_obj.memory`).
+    """
+
+    def __init__(self, state_name, rnn_obj, init_state):
+        self._state_name = state_name  # each is a rnn.memory
+        self._rnn_obj = rnn_obj
+        # The memory is booted from the wrapped initial variable.
+        memory = rnn_obj.memory(
+            init=init_state.value, need_reorder=init_state.need_reorder)
+        self._state_mem = memory
+
+    def get_state(self):
+        """Return the memory variable created for this state."""
+        return self._state_mem
+
+    def update_state(self, state):
+        """Register `state` as the new content of the memory."""
+        rnn = self._rnn_obj
+        rnn.update_memory(self._state_mem, state)
+
+
+class _ArrayState(object):
+    """
+    State holder that keeps a hidden state in a LoD tensor array created in
+    the given block, indexed by an int64 counter that is incremented on every
+    update.
+    """
+
+    def __init__(self, state_name, block, init_state):
+        self._state_name = state_name
+        self._block = block
+
+        # Array that stores one state entry per step.
+        self._state_array = block.create_var(
+            name=unique_name.generate('array_state_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=init_state.value.dtype)
+
+        # Index of the most recently written entry.
+        counter = block.create_var(
+            name=unique_name.generate('array_state_counter'),
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            dtype='int64')
+        self._counter = counter
+
+        # initialize counter
+        counter_attrs = {
+            'shape': [1],
+            'dtype': counter.dtype,
+            'value': 0.0,
+            'force_cpu': True
+        }
+        block.append_op(
+            type='fill_constant',
+            inputs={},
+            outputs={'Out': [counter]},
+            attrs=counter_attrs)
+
+        counter.stop_gradient = True
+
+        # write initial state
+        block.append_op(
+            type='write_to_array',
+            inputs={'X': init_state.value,
+                    'I': counter},
+            outputs={'Out': self._state_array})
+
+    def get_state(self):
+        """Read the array entry at the current counter position."""
+        return layers.array_read(array=self._state_array, i=self._counter)
+
+    def update_state(self, state):
+        """Advance the counter by one and write `state` at the new position."""
+        layers.increment(x=self._counter, value=1, in_place=True)
+        layers.array_write(state, array=self._state_array, i=self._counter)
+
+
+class StateCell(object):
+    """
+    The state cell class stores the hidden state of the RNN cell. A typical RNN
+    cell has one or more hidden states, and one or more step inputs. This class
+    allows you to define the names of hidden states as well as step inputs, and
+    their associated variables.
+
+    Args:
+        inputs (dict): A feeding dict of {name(str) : Variable}. It specifies
+            the names of step inputs for RNN cell, and the associated variables.
+            The variable could initially be None and set manually during each
+            RNN step.
+        states (dict): A feeding dict of {name(str) : InitState object}. It
+            specifies the names of hidden states and their initialized state.
+        out_state (str): A string that specifies the name of hidden state that
+            will be used to compute the score in beam search process.
+        name (str): The name of the RNN cell. Default None.
+
+    Raises:
+        `ValueError`: If the initial state is not an instance of InitState, or
+            the out_state is not in the dict of states.
+
+    Returns:
+        StateCell: The initialized StateCell object.
+
+    Examples:
+        .. code-block:: python
+
+          hidden_state = InitState(init=encoder_out, need_reorder=True)
+          state_cell = StateCell(
+              inputs={'current_word': None},
+              states={'h': hidden_state},
+              out_state='h')
+    """
+
+    def __init__(self, inputs, states, out_state, name=None):
+        self._helper = LayerHelper('state_cell', name=name)
+        # name -> InitState before the lazy switch, name -> state variable
+        # afterwards (see `_switch_decoder`).
+        self._cur_states = {}
+        self._state_names = []
+        for state_name, state in states.items():
+            if not isinstance(state, InitState):
+                raise ValueError('state must be an InitState object.')
+            self._cur_states[state_name] = state
+            self._state_names.append(state_name)
+        self._inputs = inputs  # inputs is place holder here
+        self._cur_decoder_obj = None
+        self._in_decoder = False
+        # state name -> {id(decoder): _MemoryState | _ArrayState}
+        self._states_holder = {}
+        self._switched_decoder = False
+        self._state_updater = None
+        self._out_state = out_state
+        if self._out_state not in self._cur_states:
+            raise ValueError('out_state must be one state in states')
+
+    def _enter_decoder(self, decoder_obj):
+        """
+        Attach this cell to `decoder_obj`.
+
+        Raises:
+            ValueError: If the cell is already attached to a decoder.
+        """
+        if self._in_decoder or self._cur_decoder_obj is not None:
+            raise ValueError('StateCell has already entered a decoder.')
+        self._in_decoder = True
+        self._cur_decoder_obj = decoder_obj
+        self._switched_decoder = False
+
+    def _leave_decoder(self, decoder_obj):
+        """
+        Detach this cell from `decoder_obj`.
+
+        Raises:
+            ValueError: If the cell is not attached to a decoder, or is
+                attached to a different one.
+        """
+        if not self._in_decoder:
+            raise ValueError('StateCell not in decoder, '
+                             'invalid leaving operation.')
+
+        if self._cur_decoder_obj != decoder_obj:
+            raise ValueError('Inconsistent decoder object in StateCell.')
+
+        self._in_decoder = False
+        self._cur_decoder_obj = None
+        self._switched_decoder = False
+
+    def _switch_decoder(self):  # lazy switch
+        """
+        Create the decoder specific state holders for every state that does
+        not have one yet, and replace the current states with the variables
+        read back from those holders.
+
+        Raises:
+            ValueError: If the cell is not attached to a decoder, the switch
+                was already performed, a state is not an InitState, or the
+                decoder type is unknown.
+        """
+        if not self._in_decoder:
+            raise ValueError('StateCell must be enter a decoder.')
+
+        if self._switched_decoder:
+            raise ValueError('StateCell already done switching.')
+
+        for state_name in self._state_names:
+            if state_name not in self._states_holder:
+                state = self._cur_states[state_name]
+
+                if not isinstance(state, InitState):
+                    raise ValueError('Current type of state is %s, should be '
+                                     'an InitState object.' % type(state))
+
+                self._states_holder[state_name] = {}
+
+                if self._cur_decoder_obj.type == _DecoderType.TRAINING:
+                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
+                        = _MemoryState(state_name,
+                                       self._cur_decoder_obj.dynamic_rnn,
+                                       state)
+                elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH:
+                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
+                        = _ArrayState(state_name,
+                                      self._cur_decoder_obj._parent_block(),
+                                      state)
+                else:
+                    raise ValueError('Unknown decoder type, only support '
+                                     '[TRAINING, BEAM_SEARCH]')
+
+            # Read back, since current state should be LoDTensor
+            self._cur_states[state_name] = \
+                self._states_holder[state_name][
+                    id(self._cur_decoder_obj)].get_state()
+
+        self._switched_decoder = True
+
+    def get_state(self, state_name):
+        """
+        The getter of state object. Find the state variable by its name.
+
+        Args:
+            state_name (str): A string of the state's name.
+
+        Returns:
+            The associated state object.
+
+        Raises:
+            ValueError: If `state_name` is not a known state.
+        """
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        if state_name not in self._cur_states:
+            raise ValueError(
+                'Unknown state %s. Please make sure _switch_decoder() '
+                'invoked.' % state_name)
+
+        return self._cur_states[state_name]
+
+    def get_input(self, input_name):
+        """
+        The getter of input variable. Find the input variable by its name.
+
+        Args:
+            input_name (str): The string of the input's name.
+
+        Returns:
+            The associated input variable.
+
+        Raises:
+            ValueError: If `input_name` is unknown or its variable is still
+                None.
+        """
+        if input_name not in self._inputs or self._inputs[input_name] is None:
+            raise ValueError('Invalid input %s.' % input_name)
+        return self._inputs[input_name]
+
+    def set_state(self, state_name, state_value):
+        """
+        The setter of the state variable. Change the variable of the given
+        `state_name`.
+
+        Args:
+            state_name (str): The name of the state to change.
+            state_value (Var): The variable of the new state.
+        """
+        self._cur_states[state_name] = state_value
+
+    def state_updater(self, updater):
+        """
+        Set up the updater to update the hidden state every RNN step. The
+        behavior of updater could be customized by users. The updater should be
+        a function that takes a `StateCell` object as input and update the
+        hidden state within it. The hidden state could be accessed through
+        `get_state` method.
+
+        This method can be used as a decorator. `compute_state` always invokes
+        the original `updater`; the returned wrapper only adds an argument
+        type check for direct calls.
+
+        Args:
+            updater (func): the updater to update the state cell.
+
+        Returns:
+            func: A wrapper around `updater` that raises `TypeError` when it
+            is called with something that is not a `StateCell`.
+        """
+        self._state_updater = updater
+
+        def _decorator(state_cell):
+            # Reject anything that is not a StateCell. The check used to be
+            # inverted and refused the owning cell itself.
+            if not isinstance(state_cell, StateCell):
+                raise TypeError('Updater should only accept a StateCell object '
+                                'as argument.')
+            updater(state_cell)
+
+        return _decorator
+
+    def compute_state(self, inputs):
+        """
+        Provide the step input of RNN cell, and compute the new hidden state
+        with updater and give step input.
+
+        Args:
+            inputs (dict): A feed dict, {name(str): Variable}. name should be
+            the names of step inputs for this RNN cell, and Variable should be
+            the associated variables.
+
+        Raises:
+            ValueError: If a name in `inputs` was not declared as an input of
+                this cell.
+
+        Examples:
+        .. code-block:: python
+
+          state_cell.compute_state(inputs={'x': current_word})
+        """
+        if self._in_decoder and not self._switched_decoder:
+            self._switch_decoder()
+
+        for input_name, input_value in inputs.items():
+            if input_name not in self._inputs:
+                raise ValueError('Unknown input %s. '
+                                 'Please make sure %s in input '
+                                 'place holder.' % (input_name, input_name))
+            self._inputs[input_name] = input_value
+        self._state_updater(self)
+
+    def update_states(self):
+        """
+        Update and record state information after each RNN step.
+
+        Raises:
+            ValueError: If a state has no holder for the current decoder.
+        """
+        if self._in_decoder and not self._switched_decoder:
+            # Perform the lazy switch. `_switched_decoder` is the boolean
+            # flag, `_switch_decoder` is the method.
+            self._switch_decoder()
+
+        for state_name, decoder_state in self._states_holder.items():
+            if id(self._cur_decoder_obj) not in decoder_state:
+                raise ValueError('Unknown decoder object, please make sure '
+                                 'switch_decoder been invoked.')
+            decoder_state[id(self._cur_decoder_obj)].update_state(
+                self._cur_states[state_name])
+
+    def out_state(self):
+        """
+        Get the output state variable. This must be called after update_states.
+
+        Returns:
+            The output variable of the RNN cell.
+        """
+        return self._cur_states[self._out_state]
+
+
+class TrainingDecoder(object):
+    """
+    Decoder meant for training only. It is built around a `StateCell` and a
+    `layers.DynamicRNN`; the per-step computation is described inside the
+    decoder's `block()`.
+
+    Args:
+        state_cell (StateCell): A StateCell object that handles the input and
+            state variables.
+        name (str): The name of this decoder. Default None.
+
+    Returns:
+        TrainingDecoder: The initialized TrainingDecoder object.
+
+    Examples:
+        .. code-block:: python
+
+          decoder = TrainingDecoder(state_cell)
+          with decoder.block():
+              current_word = decoder.step_input(trg_embedding)
+              decoder.state_cell.compute_state(inputs={'x': current_word})
+              current_score = layers.fc(input=decoder.state_cell.get_state('h'),
+                                        size=32,
+                                        act='softmax')
+              decoder.state_cell.update_states()
+              decoder.output(current_score)
+    """
+    # Life cycle of the decoder relative to its `block()`.
+    BEFORE_DECODER = 0
+    IN_DECODER = 1
+    AFTER_DECODER = 2
+
+    def __init__(self, state_cell, name=None):
+        self._helper = LayerHelper('training_decoder', name=name)
+        self._status = TrainingDecoder.BEFORE_DECODER
+        self._dynamic_rnn = layers.DynamicRNN()
+        self._type = _DecoderType.TRAINING
+        self._state_cell = state_cell
+        # The cell is bound to this decoder until `block()` has finished.
+        state_cell._enter_decoder(self)
+
+    @contextlib.contextmanager
+    def block(self):
+        """
+        Context manager in which the computation of a single RNN time step is
+        defined. It can be entered only once per decoder.
+        """
+        if self._status != TrainingDecoder.BEFORE_DECODER:
+            raise ValueError('decoder.block() can only be invoked once')
+        self._status = TrainingDecoder.IN_DECODER
+        rnn_block = self._dynamic_rnn.block()
+        with rnn_block:
+            yield
+        self._status = TrainingDecoder.AFTER_DECODER
+        self._state_cell._leave_decoder(self)
+
+    @property
+    def state_cell(self):
+        """The attached `StateCell`; only accessible inside `block()`."""
+        self._assert_in_decoder_block('state_cell')
+        return self._state_cell
+
+    @property
+    def dynamic_rnn(self):
+        """The underlying `layers.DynamicRNN` object."""
+        return self._dynamic_rnn
+
+    @property
+    def type(self):
+        """The decoder type tag (`_DecoderType.TRAINING`)."""
+        return self._type
+
+    def step_input(self, x):
+        """
+        Register a variable as a step input of the RNN cell. In machine
+        translation, for instance, one word of the target sentence is read at
+        every time step, so the target sentence is a step input.
+
+        Args:
+            x (Variable): the variable to be used as step input.
+
+        Returns:
+            Variable: The variable as input of current step.
+
+        Examples:
+        .. code-block:: python
+
+          current_word = decoder.step_input(trg_embedding)
+        """
+        self._assert_in_decoder_block('step_input')
+        step_var = self._dynamic_rnn.step_input(x)
+        return step_var
+
+    def static_input(self, x):
+        """
+        Register a variable as a static input of the RNN cell. Unlike a step
+        input, the variable is used as a whole within the RNN decode loop and
+        is not scattered into time steps.
+
+        Args:
+            x (Variable): the variable to be used as static input.
+
+        Returns:
+            Variable: The variable as input of current step.
+
+        Examples:
+        .. code-block:: python
+
+          encoder_vec = decoder.static_input(encoded_vector)
+        """
+        self._assert_in_decoder_block('static_input')
+        static_var = self._dynamic_rnn.static_input(x)
+        return static_var
+
+    def __call__(self, *args, **kwargs):
+        """
+        Fetch the output of the RNN. Only valid once `block()` has been left;
+        all arguments are forwarded to the underlying dynamic RNN.
+
+        Returns:
+            Variable: The specified output of the RNN cell.
+        """
+        if self._status == TrainingDecoder.AFTER_DECODER:
+            return self._dynamic_rnn(*args, **kwargs)
+        raise ValueError('Output of training decoder can only be visited '
+                         'outside the block.')
+
+    def output(self, *outputs):
+        """
+        Declare the output variables of the RNN cell.
+
+        Args:
+            *outputs (Variables): a series of variables that treated as output
+                of the RNN cell.
+
+        Examples:
+        .. code-block:: python
+
+          out = fluid.layers.fc(input=h,
+                                size=32,
+                                bias_attr=True,
+                                act='softmax')
+          decoder.output(out)
+        """
+        self._assert_in_decoder_block('output')
+        self._dynamic_rnn.output(*outputs)
+
+    def _assert_in_decoder_block(self, method):
+        """Raise ValueError unless called while inside `block()`."""
+        if self._status == TrainingDecoder.IN_DECODER:
+            return
+        raise ValueError('%s should be invoked inside block of '
+                         'TrainingDecoder object.' % method)
+
+
+class BeamSearchDecoder(object):
+    """
+    A beam search decoder that can be used for inference. The decoder should be
+    initialized with a `StateCell` object. The decode process can be defined
+    within its block.
+
+    Args:
+        state_cell (StateCell): A StateCell object that handles the input and
+            state variables.
+        init_ids (Variable): The init beam search token ids.
+        init_scores (Variable): The associated score of each id.
+        target_dict_dim (int): Size of dictionary.
+        word_dim (int): Word embedding dimension.
+        input_var_dict (dict): A feeding dict to feed the required input
+            variables to the state cell. It will be used by state_cell 's
+            compute method. Default None, which is treated as an empty dict.
+        topk_size (int): The topk size used for beam search. Default 50.
+        sparse_emb (bool): Passed as `is_sparse` to the embedding layer built
+            in `decode`. Default True.
+        max_len (int): The maximum allowed length of the generated sentence.
+            Default 100.
+        beam_size (int): The beam width of beam search decode. Default 1.
+        end_id (int): The id of end token within beam search. Default 1.
+        name (str): The name of this decoder. Default None.
+
+    Returns:
+        BeamSearchDecoder: A initialized BeamSearchDecoder object.
+
+    Examples:
+    .. code-block:: python
+
+      decoder = BeamSearchDecoder(
+          state_cell=state_cell,
+          init_ids=init_ids,
+          init_scores=init_scores,
+          target_dict_dim=target_dict_dim,
+          word_dim=word_dim,
+          input_var_dict={},
+          topk_size=topk_size,
+          sparse_emb=IS_SPARSE,
+          max_len=max_length,
+          beam_size=beam_size,
+          end_id=1,
+          name=None
+      )
+      decoder.decode()
+      translation_ids, translation_scores = decoder()
+    """
+    # Life cycle of the decoder relative to its `block()`.
+    BEFORE_BEAM_SEARCH_DECODER = 0
+    IN_BEAM_SEARCH_DECODER = 1
+    AFTER_BEAM_SEARCH_DECODER = 2
+
+    def __init__(self,
+                 state_cell,
+                 init_ids,
+                 init_scores,
+                 target_dict_dim,
+                 word_dim,
+                 input_var_dict=None,
+                 topk_size=50,
+                 sparse_emb=True,
+                 max_len=100,
+                 beam_size=1,
+                 end_id=1,
+                 name=None):
+        self._helper = LayerHelper('beam_search_decoder', name=name)
+        self._counter = layers.zeros(shape=[1], dtype='int64')
+        self._counter.stop_gradient = True
+        self._type = _DecoderType.BEAM_SEARCH
+        # Kept so that `block()` can refresh the loop condition each step.
+        self._max_len = layers.fill_constant(
+            shape=[1], dtype='int64', value=max_len)
+        self._cond = layers.less_than(
+            x=self._counter,
+            y=layers.fill_constant(
+                shape=[1], dtype='int64', value=max_len))
+        self._while_op = layers.While(self._cond)
+        self._state_cell = state_cell
+        self._state_cell._enter_decoder(self)
+        self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER
+        self._zero_idx = layers.fill_constant(
+            shape=[1], value=0, dtype='int64', force_cpu=True)
+        # name of a variable returned by `read_array` -> its backing array
+        self._array_dict = {}
+        # (value, array) pairs written at the end of every step by `block()`
+        self._array_link = []
+        self._ids_array = None
+        self._scores_array = None
+        self._beam_size = beam_size
+        self._end_id = end_id
+
+        self._init_ids = init_ids
+        self._init_scores = init_scores
+        self._target_dict_dim = target_dict_dim
+        self._topk_size = topk_size
+        self._sparse_emb = sparse_emb
+        self._word_dim = word_dim
+        # A fresh dict per instance; a `{}` default would be shared between
+        # all decoders created without this argument.
+        self._input_var_dict = {} if input_var_dict is None \
+            else input_var_dict
+
+    @contextlib.contextmanager
+    def block(self):
+        """
+        Define the behavior of the decoder for each RNN time step. After the
+        user code ran, and while the loop condition still holds, the counter
+        is incremented, every value registered with `update_array` is written
+        to its array and the loop condition is recomputed.
+
+        Raises:
+            ValueError: If the block was already entered once.
+        """
+        if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER:
+            raise ValueError('block() can only be invoke once.')
+
+        self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER
+
+        with self._while_op.block():
+            yield
+            with layers.Switch() as switch:
+                with switch.case(self._cond):
+                    layers.increment(x=self._counter, value=1.0, in_place=True)
+
+                    for value, array in self._array_link:
+                        layers.array_write(
+                            x=value, i=self._counter, array=array)
+
+                    layers.less_than(
+                        x=self._counter, y=self._max_len, cond=self._cond)
+
+        self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER
+        self._state_cell._leave_decoder(self)
+
+    @property
+    def type(self):
+        """The decoder type tag (`_DecoderType.BEAM_SEARCH`)."""
+        return self._type
+
+    def early_stop(self):
+        """
+        Stop the generation process in advance. Could be used as "break".
+        """
+        layers.fill_constant(
+            shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond)
+
+    def decode(self):
+        """
+        Set up the computation within the decoder. Then you could call the
+        decoder to get the result of beam search decode. If you want to define
+        a more specific decoder, you could override this function.
+
+        Raises:
+            ValueError: If a name in `input_var_dict` is not an input of the
+                state cell.
+
+        Examples:
+        .. code-block:: python
+
+          decoder.decode()
+          translation_ids, translation_scores = decoder()
+        """
+        with self.block():
+            prev_ids = self.read_array(init=self._init_ids, is_ids=True)
+            prev_scores = self.read_array(
+                init=self._init_scores, is_scores=True)
+            prev_ids_embedding = layers.embedding(
+                input=prev_ids,
+                size=[self._target_dict_dim, self._word_dim],
+                dtype='float32',
+                is_sparse=self._sparse_emb)
+
+            feed_dict = {}
+            update_dict = {}
+
+            for init_var_name, init_var in self._input_var_dict.items():
+                if init_var_name not in self.state_cell._inputs:
+                    raise ValueError('Variable ' + init_var_name +
+                                     ' not found in StateCell!\n')
+
+                read_var = self.read_array(init=init_var)
+                update_dict[init_var_name] = read_var
+                feed_var_expanded = layers.sequence_expand(read_var,
+                                                           prev_scores)
+                feed_dict[init_var_name] = feed_var_expanded
+
+            for state_str in self._state_cell._state_names:
+                prev_state = self.state_cell.get_state(state_str)
+                prev_state_expanded = layers.sequence_expand(prev_state,
+                                                             prev_scores)
+                self.state_cell.set_state(state_str, prev_state_expanded)
+
+            # Every input that was not fed explicitly receives the embedding
+            # of the previously selected ids.
+            for input_name in self._state_cell._inputs:
+                if input_name not in feed_dict:
+                    feed_dict[input_name] = prev_ids_embedding
+
+            self.state_cell.compute_state(inputs=feed_dict)
+            current_state = self.state_cell.out_state()
+            current_state_with_lod = layers.lod_reset(
+                x=current_state, y=prev_scores)
+            scores = layers.fc(input=current_state_with_lod,
+                               size=self._target_dict_dim,
+                               act='softmax')
+            topk_scores, topk_indices = layers.topk(scores, k=self._topk_size)
+            accu_scores = layers.elementwise_add(
+                x=layers.log(x=topk_scores),
+                y=layers.reshape(
+                    prev_scores, shape=[-1]),
+                axis=0)
+            # Use the configured end token, the same one `__call__` passes to
+            # `beam_search_decode`, instead of a hard-coded 1.
+            selected_ids, selected_scores = layers.beam_search(
+                prev_ids,
+                prev_scores,
+                topk_indices,
+                accu_scores,
+                self._beam_size,
+                end_id=self._end_id,
+                level=0)
+
+            with layers.Switch() as switch:
+                with switch.case(layers.is_empty(selected_ids)):
+                    self.early_stop()
+                with switch.default():
+                    self.state_cell.update_states()
+                    self.update_array(prev_ids, selected_ids)
+                    self.update_array(prev_scores, selected_scores)
+                    for update_name, var_to_update in update_dict.items():
+                        self.update_array(var_to_update, feed_dict[update_name])
+
+    def read_array(self, init, is_ids=False, is_scores=False):
+        """
+        Read an array to get the decoded ids and scores generated by previous
+        RNN step. At the first step of RNN, the init variable must be used to
+        initialize the array.
+
+        Args:
+            init (Variable): The initial variable for first step usage. init
+                must be provided.
+            is_ids (bool): Specify whether the variable is an id.
+            is_scores (bool): Specify whether the variable is a score.
+
+        Returns:
+            The associated variable generated during previous RNN steps.
+
+        Raises:
+            ValueError: If called outside the decoder block, or if both
+                `is_ids` and `is_scores` are set.
+            TypeError: If `init` is not a Variable.
+
+        Examples:
+            .. code-block:: python
+
+              prev_ids = decoder.read_array(init=init_ids, is_ids=True)
+              prev_scores = decoder.read_array(init=init_scores, is_scores=True)
+        """
+        self._assert_in_decoder_block('read_array')
+
+        if is_ids and is_scores:
+            raise ValueError('Shouldn\'t mark current array be ids array and '
+                             'scores array at the same time.')
+
+        if not isinstance(init, Variable):
+            raise TypeError('The input argument `init` must be a Variable.')
+
+        # The array and its initial write live in the parent block, i.e.
+        # outside the while loop body.
+        parent_block = self._parent_block()
+        array = parent_block.create_var(
+            name=unique_name.generate('beam_search_decoder_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=init.dtype)
+        parent_block.append_op(
+            type='write_to_array',
+            inputs={'X': init,
+                    'I': self._zero_idx},
+            outputs={'Out': array})
+
+        if is_ids:
+            self._ids_array = array
+        elif is_scores:
+            self._scores_array = array
+
+        read_value = layers.array_read(array=array, i=self._counter)
+        # Remember the backing array so `update_array` can find it later.
+        self._array_dict[read_value.name] = array
+        return read_value
+
+    def update_array(self, array, value):
+        """
+        Store the value generated in current step in an array for each RNN step.
+        This array could be accessed by read_array method.
+
+        Args:
+            array (Variable): The variable previously returned by
+                `read_array`; it identifies the array to append to.
+            value (Variable): The newly generated value to be stored.
+
+        Raises:
+            ValueError: If called outside the decoder block, or if `array`
+                was not obtained from `read_array`.
+            TypeError: If `array` or `value` is not a Variable.
+        """
+        self._assert_in_decoder_block('update_array')
+
+        if not isinstance(array, Variable):
+            raise TypeError(
+                'The input argument `array` of  must be a Variable.')
+        if not isinstance(value, Variable):
+            raise TypeError('The input argument `value` of must be a Variable.')
+
+        array = self._array_dict.get(array.name, None)
+        if array is None:
+            raise ValueError('Please invoke read_array before update_array.')
+        # The actual write is emitted by `block()` at the end of the step.
+        self._array_link.append((value, array))
+
+    def __call__(self):
+        """
+        Run the decode process and return the final decode result.
+
+        Returns:
+            A tuple of decoded (id, score) pairs. id is a Variable that holds
+            the generated tokens, and score is a Variable with the same shape
+            as id, holds the score for each generated token.
+
+        Raises:
+            ValueError: If called before the decoder block has been left.
+        """
+        if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER:
+            raise ValueError('Output of BeamSearchDecoder object can '
+                             'only be visited outside the block.')
+        return layers.beam_search_decode(
+            ids=self._ids_array,
+            scores=self._scores_array,
+            beam_size=self._beam_size,
+            end_id=self._end_id)
+
+    @property
+    def state_cell(self):
+        """The attached `StateCell`; only accessible inside `block()`."""
+        self._assert_in_decoder_block('state_cell')
+        return self._state_cell
+
+    def _parent_block(self):
+        """
+        Getter of parent block.
+
+        Returns:
+            The parent block of decoder.
+
+        Raises:
+            ValueError: If the current block has no parent.
+        """
+        program = self._helper.main_program
+        parent_block_idx = program.current_block().parent_idx
+        if parent_block_idx < 0:
+            raise ValueError('Invalid block with index %d.' % parent_block_idx)
+        parent_block = program.block(parent_block_idx)
+        return parent_block
+
+    def _assert_in_decoder_block(self, method):
+        """Raise ValueError unless called while inside `block()`."""
+        if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER:
+            raise ValueError('%s should be invoked inside block of '
+                             'BeamSearchDecoder object.' % method)
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -66,7 +66,8 @@ def is_persistable(var):
            res = fluid.io.is_persistable(param)
    """
    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-            var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.READER:
        return False
    return var.persistable


--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -25,9 +25,6 @@ import numpy
 __all__ = [
    'split_lod_tensor',
    'merge_lod_tensor',
-    'BlockGuard',
-    'BlockGuardWithCompletion',
-    'WhileGuard',
    'While',
    'Switch',
    'lod_rank_table',

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -12,18 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import contextlib
+import multiprocessing
+import threading

-from .. import core
-from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
-from ..unique_name import generate as unique_name
+from ..data_feeder import DataFeeder
 from control_flow import BlockGuard
-from ..layer_helper import LayerHelper
+from layer_function_generator import templatedoc
+from .. import core
 from ..executor import global_scope
-from layer_function_generator import generate_layer_fn, templatedoc
+from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
+    default_startup_program, program_guard, Program
+from ..layer_helper import LayerHelper
+from ..unique_name import generate as unique_name

 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
-    'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
+    'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
    'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor',
    'load'
 ]
@@ -446,10 +449,15 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
    return monkey_patch_reader_methods(main_prog_var)


-def py_reader(capacity, shapes, dtypes, lod_levels=None):
+def py_reader(capacity,
+              shapes,
+              dtypes,
+              lod_levels=None,
+              name=None,
+              use_double_buffer=True):
    """
    Create a reader and blocking queue for data feeding in Python
-    
+
    This layer returns a Reader Variable and a BlockingQueue.
    The BlockingQueue provides `push()` method to push a `LoDTensorArray` 
    object into the queue in Python side. In C++ side, the Reader 
@@ -459,15 +467,18 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
    using `close()` method when unused.

    Args:
+       use_double_buffer(bool): Whether use double buffer or not.
       capacity(int): The maximum capacity of the BlockingQueue.
-       shapes(list): List of tuples which declaring data shapes.
-       dtypes(list): List of strs which declaring data type. 
-       lod_levels(list): List of ints which declaring data lod_level.
+       shapes(list|tuple): List of tuples which declaring data shapes.
+       dtypes(list|tuple): List of strs which declaring data type.
+       lod_levels(list|tuple): List of ints which declaring data lod_level.
+       name(basestring): The prefix of the Python queue name and Reader name.
+            If None, the names will be generated automatically.

    Returns:
       tuple(Variable, BlockingQueue):
       A Reader Variable from which we can get feeding data.
-       
+
       A BlockingQueue object for data feeding.

    Examples:
@@ -480,7 +491,7 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
                                             dtypes=['float32', 'int64'])
            # Via the reader, we can use 'read_file' layer to get data:
            image, label = fluid.layers.read_file(reader)
-            
+
            # Via the blocking queue, we can feed data using threads
            def feed_data(queue, feed_images, feed_labels):
                for feed_image, feed_label in zip(feed_images, feed_labels):
@@ -488,7 +499,7 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
                    data.append(feed_image)
                    data.append(feed_label)
                    queue.push(data)
-            
+
            thread = threading.Thread(target=feed_data, args=(queue, feed_images, feed_labels))
            thread.start()
    """
@@ -503,15 +514,23 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
    if lod_levels is None:
        lod_levels = [0] * len(shapes)

-    queue_name = unique_name('lod_tensor_blocking_queue')
+    if name is None:
+        queue_name = unique_name('lod_tensor_blocking_queue')
+        reader_name = unique_name('create_py_reader')
+        double_buffer_name = unique_name('double_buffer')
+    else:
+        queue_name = "_".join([name, "queue"])
+        reader_name = "_".join([name, "reader"])
+        double_buffer_name = "_".join([name, "double_buffer"])
+
    var = global_scope().var(queue_name)
    feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)

    startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=unique_name('create_py_reader'))
+    startup_var = startup_blk.create_var(name=reader_name)
    startup_blk.append_op(
        type='create_py_reader',
-        inputs={'blocking_queue': queue_name},
+        inputs={'blocking_queue': [queue_name]},
        outputs={'Out': [startup_var]},
        attrs={
            'shape_concat': shape_concat,
@@ -525,17 +544,97 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
                                      startup_var)

-    return monkey_patch_reader_methods(main_prog_var), feed_queue
+    reader = monkey_patch_reader_methods(main_prog_var)
+    if use_double_buffer:
+        double_buffer_reader = double_buffer(reader, name=double_buffer_name)
+        # we return a double buffer reader. However, the reset method comes from
+        # py_reader.
+        double_buffer_reader.reset = reader.reset
+        reader = double_buffer_reader
+
+    # monkey patch py_reader special methods
+    reader.queue = feed_queue
+    current_reset_method = reader.reset
+    reader.thread = None
+    reader.tensor_provider = None
+    reader.exited = False
+
+    def start_provide_thread(func):
+        def __provider_thread__():
+            for tensors in func():
+                array = core.LoDTensorArray()
+                for item in tensors:
+                    if not isinstance(item, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(item, core.CPUPlace())
+                        item = tmp
+
+                    array.append(item)
+
+                if reader.exited:
+                    break
+                feed_queue.push(array)
+                if reader.exited:
+                    break
+            feed_queue.close()
+
+        reader.thread = threading.Thread(target=__provider_thread__)
+        reader.thread.daemon = True
+        reader.thread.start()
+
+    def __set_tensor_provider__(func):
+        reader.tensor_provider = func
+
+    def __set_paddle_reader__(paddle_reader):
+        with program_guard(Program(), Program()):
+            feed_list = []
+            counter = 0
+            for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
+                name = str(counter)
+                feed_list.append(
+                    data(
+                        name=name,
+                        dtype=dtype,
+                        shape=shape,
+                        lod_level=lod_level))
+                counter += 1
+
+            feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(
+                paddle_reader, multi_devices=False)
+
+        def __tensor_provider__():
+            for slots in paddle_reader():
+                yield [slots[str(idx)] for idx in xrange(counter)]
+
+        __set_tensor_provider__(__tensor_provider__)
+
+    def __reset__():
+        current_reset_method()
+        if reader.thread is not None and reader.tensor_provider is not None:
+            reader.exited = True
+            reader.thread.join()
+            reader.exited = False
+
+    def __start__():
+        start_provide_thread(reader.tensor_provider)
+
+    reader.reset = __reset__
+    reader.decorate_tensor_provider = __set_tensor_provider__
+    reader.decorate_paddle_reader = __set_paddle_reader__
+    reader.start = __start__
+
+    return reader


 def open_files(filenames,
               shapes,
               lod_levels,
               dtypes,
-               thread_num=1,
+               thread_num=None,
               buffer_size=None,
               pass_num=1,
-               for_parallel=True):
+               is_test=None):
    """
    Open files

@@ -548,14 +647,14 @@ def open_files(filenames,
       shapes(list): List of tuples which declaring data shapes.
       lod_levels(list): List of ints which declaring data lod_level.
       dtypes(list): List of strs which declaring data type.
-       thread_num(int): The maximal concurrent prefetch thread number.
-       buffer_size(int|None): The size of prefetch buffer. If it is setted None, 
-            buffer size will be thread_num * 3.
-            Default: None
+       thread_num(int|None): The number of threads used to read files.
+            Default: min(len(filenames), cpu_number).
+       buffer_size(int|None): The buffer size of reader. Default: 3 * thread_num
       pass_num(int): Number of passes to run.
-       for_parallel(Bool): Set it as True if you are going to run 
-            subsequent operators in parallel.
-            Default: True
+       is_test(bool|None): Whether `open_files` used for testing or not. If it
+            is used for testing, the order of data generated is same as the file
+            order. Otherwise, it is not guaranteed the order of data is same
+            between every epoch. [Default: False].

    Returns:
       Variable: A Reader Variable via which we can get file data.
@@ -567,15 +666,21 @@ def open_files(filenames,
                                                     './data2.recordio'],
                                             shapes=[(3,224,224), (1)],
                                             lod_levels=[0, 0],
-                                             dtypes=['float32', 'int64'],
-                                             thread_num=2,
-                                             buffer_size=2)
+                                             dtypes=['float32', 'int64'])

         # Via the reader, we can use 'read_file' layer to get data:
         image, label = fluid.layers.io.read_file(reader)
    """
+    # Wrap a lone filename first so len(filenames) counts files, not chars.
+    if isinstance(filenames, basestring):
+        filenames = [filenames]
+    if thread_num is None:
+        thread_num = min(len(filenames), multiprocessing.cpu_count())
+    else:
+        thread_num = int(thread_num)
    if buffer_size is None:
-        buffer_size = thread_num * 3
+        buffer_size = 3 * thread_num
+    else:
+        buffer_size = int(buffer_size)
+
-    if isinstance(filenames, basestring):
-        filenames = [filenames]
    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
@@ -589,17 +694,18 @@ def open_files(filenames,
    multi_file_reader_name = unique_name('multi_file_reader')
    startup_blk = default_startup_program().current_block()
    startup_reader = startup_blk.create_var(name=multi_file_reader_name)
+    attrs = {
+        'shape_concat': shape_concat,
+        'lod_levels': lod_levels,
+        'ranks': ranks,
+        'file_names': filenames,
+        'thread_num': thread_num,
+        'buffer_size': buffer_size
+    }
+    if is_test is not None:
+        attrs['is_test'] = is_test
    startup_blk.append_op(
-        type='open_files',
-        outputs={'Out': [startup_reader]},
-        attrs={
-            'shape_concat': shape_concat,
-            'lod_levels': lod_levels,
-            'ranks': ranks,
-            'file_names': filenames,
-            'thread_num': thread_num,
-            'buffer_size': buffer_size
-        })
+        type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs)

    startup_reader.desc.set_dtypes(dtypes)
    startup_reader.persistable = True
@@ -802,7 +908,7 @@ class Preprocessor(object):
        self.sink_var_names = None
        self.status = Preprocessor.BEFORE_SUB_BLOCK

-    def is_completed(self):
+    def _is_completed(self):
        return self.sub_block and self.source_var_names and self.sink_var_names

    @contextlib.contextmanager
@@ -812,7 +918,7 @@ class Preprocessor(object):
        yield
        self.main_prog.rollback()
        self.status = Preprocessor.AFTER_SUB_BLOCK
-        if not self.is_completed():
+        if not self._is_completed():
            raise RuntimeError(
                "The definition of preprocessor is incompleted! "
                "Please make sure that you have set input and output "

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -114,23 +114,13 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
            prediction = network(image, is_infer=True)
            auc_out=fluid.layers.auc(input=prediction, label=label)
    """
-
-    warnings.warn(
-        "This interface is not recommended, fluid.layers.auc compute the auc at every minibatch, \
-        but can not aggregate them and get the pass AUC, because pass \
-        auc can not be averaged with weighted from the minibatch auc value. \
-        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
-        which can get every minibatch and every pass auc value.", Warning)
    helper = LayerHelper("auc", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    topk_out, topk_indices = nn.topk(input, k=k)
-    auc_out = helper.create_tmp_variable(dtype="float32")
+    auc_out = helper.create_tmp_variable(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(persistable=True)
-    tn = helper.create_global_variable(persistable=True)
-    fp = helper.create_global_variable(persistable=True)
-    fn = helper.create_global_variable(persistable=True)
+    tp = helper.create_global_variable(persistable=True, dtype='int64')
+    tn = helper.create_global_variable(persistable=True, dtype='int64')
+    fp = helper.create_global_variable(persistable=True, dtype='int64')
+    fn = helper.create_global_variable(persistable=True, dtype='int64')
    for var in [tp, tn, fp, fn]:
        helper.set_variable_initializer(
            var, Constant(
@@ -139,8 +129,7 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
    helper.append_op(
        type="auc",
        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
+            "Predict": [input],
            "Label": [label],
            "TP": [tp],
            "TN": [tn],
@@ -156,4 +145,4 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
            "FPOut": [fp],
            "FNOut": [fn]
        })
-    return auc_out
+    return auc_out, [tp, tn, fp, fn]
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -166,7 +166,8 @@ def fc(input,
        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
            parameters/weights of this layer.
        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to None, no bias will be added to the output units.
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
        act (str, default None): Activation to be applied to the output of this layer.
        is_test(bool): A flag indicating whether execution is in test phase.
        use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
@@ -2960,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
            # x is a Tensor variable with following elements:
            #    [[0.2, 0.3, 0.5, 0.9]
            #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
            fluid.layers.reduce_sum(x)  # [3.5]
            fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
            fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
@@ -2969,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
            #      [[[1, 2], [3, 4]],
            #      [[5, 6], [7, 8]]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
            fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
            fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]


--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -591,7 +591,7 @@ class Auc(MetricBase):
                      for i in range(self._num_thresholds - 2)]
        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]

-        # caculate TP, FN, TN, FP count
+        # calculate TP, FN, TN, FP count
        for idx_thresh, thresh in enumerate(thresholds):
            tp, fn, tn, fp = 0, 0, 0, 0
            for i, lbl in enumerate(labels):

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer):

        & if (use\_nesterov):

-        &\quad   param = param - gradient * learning\_rate + mu * velocity * learning\_rate
+        &\quad   param = param - (gradient + mu * velocity) * learning\_rate

        & else:

@@ -1180,16 +1180,16 @@ class ModelAverage(Optimizer):
                self._add_average_restore_op(block, param_grad)

    def _add_average_apply_op(self, block, param_grad):
-        param = block.clone_variable(param_grad[0])
-        grad = block.clone_variable(param_grad[1])
-        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
-        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
-        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
-        num_accumulates = block.clone_variable(
+        param = block._clone_variable(param_grad[0])
+        grad = block._clone_variable(param_grad[1])
+        sum_1 = block._clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block._clone_variable(
            self._get_accumulator('num_accumulates', param))
-        old_num_accumulates = block.clone_variable(
+        old_num_accumulates = block._clone_variable(
            self._get_accumulator('old_num_accumulates', param))
-        num_updates = block.clone_variable(
+        num_updates = block._clone_variable(
            self._get_accumulator('num_updates', param))
        # backup param value to grad
        layers.assign(input=param, output=grad)
@@ -1203,8 +1203,8 @@ class ModelAverage(Optimizer):
        layers.elementwise_div(x=sum, y=tmp, out=param)

    def _add_average_restore_op(self, block, param_grad):
-        param = block.clone_variable(param_grad[0])
-        grad = block.clone_variable(param_grad[1])
+        param = block._clone_variable(param_grad[0])
+        grad = block._clone_variable(param_grad[1])
        layers.assign(input=grad, output=param)

    def _append_average_accumulate_op(self, param):

--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -14,6 +14,7 @@
 from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
 import sys
 import numpy
@@ -134,4 +135,4 @@ def main(use_cuda):

 if __name__ == '__main__':
    # for use_cuda in (False, True):
-    main(use_cuda=True)
+    main(use_cuda=core.is_compiled_with_cuda())
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import print_function

+import paddle.fluid.core as core
 import math
 import os
 import sys
@@ -257,6 +258,8 @@ def inject_test_method(use_cuda, parallel, nn_type, combine):

 def inject_all_tests():
    for use_cuda in (False, True):
+        if use_cuda and not core.is_compiled_with_cuda():
+            continue
        for parallel in (False, True):
            for nn_type in ('mlp', 'conv'):
                inject_test_method(use_cuda, parallel, nn_type, True)

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -245,7 +245,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
                    is_sparse=is_sparse,
                    is_parallel=is_parallel)

-    if use_cuda and is_sparse:
+    if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
        fn = __impl__
    else:
        # skip the other test when on CI server

--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+
+import paddle
+import paddle.dataset.mnist as mnist
+import paddle.fluid as fluid
+import paddle.v2
+
+
+def network(is_train):
+    reader = fluid.layers.py_reader(
+        capacity=10,
+        shapes=((-1, 784), (-1, 1)),
+        dtypes=('float32', 'int64'),
+        name="train_reader" if is_train else "test_reader",
+        use_double_buffer=True)
+    img, label = fluid.layers.read_file(reader)
+
+    hidden = img
+
+    for i in xrange(2):
+        hidden = fluid.layers.fc(input=hidden, size=100, act='tanh')
+        hidden = fluid.layers.dropout(
+            hidden, dropout_prob=0.5, is_test=not is_train)
+
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    return fluid.layers.mean(loss), reader
+
+
+def main():
+    train_prog = fluid.Program()
+    startup_prog = fluid.Program()
+
+    with fluid.program_guard(train_prog, startup_prog):
+        with fluid.unique_name.guard():
+            loss, train_reader = network(True)
+            adam = fluid.optimizer.Adam(learning_rate=0.01)
+            adam.minimize(loss)
+
+    test_prog = fluid.Program()
+    test_startup = fluid.Program()
+    with fluid.program_guard(test_prog, test_startup):
+        with fluid.unique_name.guard():
+            test_loss, test_reader = network(False)
+
+    use_cuda = fluid.core.is_compiled_with_cuda()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    fluid.Executor(place).run(startup_prog)
+    fluid.Executor(place).run(test_startup)
+
+    trainer = fluid.ParallelExecutor(
+        use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog)
+
+    tester = fluid.ParallelExecutor(
+        use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)
+
+    train_reader.decorate_paddle_reader(
+        paddle.v2.reader.shuffle(
+            paddle.batch(mnist.train(), 512), buf_size=8192))
+
+    test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
+
+    for epoch_id in xrange(10):
+        train_reader.start()
+        try:
+            while True:
+                print 'train_loss', numpy.array(
+                    trainer.run(fetch_list=[loss.name]))
+        except fluid.core.EOFException:
+            print 'End of epoch', epoch_id
+            train_reader.reset()
+
+        test_reader.start()
+        try:
+            while True:
+                print 'test loss', numpy.array(
+                    tester.run(fetch_list=[test_loss.name]))
+        except fluid.core.EOFException:
+            print 'End of testing'
+            test_reader.reset()
+
+
+if __name__ == '__main__':
+    main()
--- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
@@ -31,7 +31,10 @@ def load_vocab(filename):


 # load word dict with paddle inner function
-word_dict = load_vocab(sys.argv[1])
+if len(sys.argv) == 1:
+    word_dict = paddle.dataset.imdb.word_dict()
+else:
+    word_dict = load_vocab(sys.argv[1])
 word_dict["<unk>"] = len(word_dict)
 print "Dict dim = ", len(word_dict)


--- a/python/paddle/fluid/tests/demo/text_classification/train.py
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
@@ -41,16 +41,14 @@ def network_cfg(is_train, pass_num=100):
            pass_num=pass_num,
            shapes=[[-1, 1], [-1, 1]],
            lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
+            dtypes=['int64', 'int64'])

        test_file_obj = fluid.layers.open_files(
            filenames=TEST_FILES,
            pass_num=1,
            shapes=[[-1, 1], [-1, 1]],
            lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
+            dtypes=['int64', 'int64'])

        if is_train:
            file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)

--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A simple machine translation demo using beam search decoder.
+"""
+
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as layers
+from paddle.fluid.executor import Executor
+from paddle.fluid.contrib.decoder.beam_search_decoder import *
+import unittest
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 32
+decoder_size = hidden_dim
+IS_SPARSE = True
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+
+def encoder():
+    # encoder
+    src_word = layers.data(
+        name="src_word", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def decoder_state_cell(context):
+    h = InitState(init=context, need_reorder=True)
+    state_cell = StateCell(inputs={'x': None}, states={'h': h}, out_state='h')
+
+    @state_cell.state_updater
+    def updater(state_cell):
+        current_word = state_cell.get_input('x')
+        prev_h = state_cell.get_state('h')
+        # make sure the lod of h is inherited from prev_h
+        h = layers.fc(input=[prev_h, current_word],
+                      size=decoder_size,
+                      act='tanh')
+        state_cell.set_state('h', h)
+
+    return state_cell
+
+
+def decoder_train(state_cell):
+    # decoder
+    trg_language_word = layers.data(
+        name="target_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    decoder = TrainingDecoder(state_cell)
+
+    with decoder.block():
+        current_word = decoder.step_input(trg_embedding)
+        decoder.state_cell.compute_state(inputs={'x': current_word})
+        current_score = layers.fc(input=decoder.state_cell.get_state('h'),
+                                  size=target_dict_dim,
+                                  act='softmax')
+        decoder.state_cell.update_states()
+        decoder.output(current_score)
+
+    return decoder()
+
+
+def decoder_decode(state_cell):
+    init_ids = layers.data(
+        name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = layers.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    decoder = BeamSearchDecoder(
+        state_cell=state_cell,
+        init_ids=init_ids,
+        init_scores=init_scores,
+        target_dict_dim=target_dict_dim,
+        word_dim=word_dim,
+        input_var_dict={},
+        topk_size=topk_size,
+        sparse_emb=IS_SPARSE,
+        max_len=max_length,
+        beam_size=beam_size,
+        end_id=1,
+        name=None)
+    decoder.decode()
+    translation_ids, translation_scores = decoder()
+
+    return translation_ids, translation_scores
+
+
+def train_main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder()
+    state_cell = decoder_state_cell(context)
+    rnn_out = decoder_train(state_cell)
+    label = layers.data(
+        name="target_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
+    optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    feed_order = ['src_word', 'target_word', 'target_next_word']
+
+    exe = Executor(place)
+
+    def train_loop(main_program):
+        exe.run(framework.default_startup_program())
+
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
+        for pass_id in xrange(1):
+            for batch_id, data in enumerate(train_reader()):
+                outs = exe.run(main_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_cost])
+                avg_cost_val = np.array(outs[0])
+                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                      " avg_cost=" + str(avg_cost_val))
+                if batch_id > 3:
+                    break
+
+    train_loop(framework.default_main_program())
+
+
+def decode_main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder()
+    state_cell = decoder_state_cell(context)
+    translation_ids, translation_scores = decoder_decode(state_cell)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_lod = [1] * batch_size
+    init_lod = [init_lod, init_lod]
+
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
+    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = ['src_word']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    data = train_reader().next()
+    feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+    feed_dict['init_ids'] = init_ids
+    feed_dict['init_scores'] = init_scores
+
+    result_ids, result_scores = exe.run(
+        framework.default_main_program(),
+        feed=feed_dict,
+        fetch_list=[translation_ids, translation_scores],
+        return_numpy=False)
+    print result_ids.lod()
+
+
+class TestBeamSearchDecoder(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda):
+    f_name = 'test_{0}_train'.format('cuda' if use_cuda else 'cpu')
+
+    def f(*args):
+        with scope_prog_guard():
+            train_main(use_cuda)
+
+    setattr(TestBeamSearchDecoder, f_name, f)
+
+
+def inject_test_decode(use_cuda, decorator=None):
+    """
+    Attach a beam-search decoding test method to TestBeamSearchDecoder.
+
+    Args:
+        use_cuda(bool): Passed through to decode_main; it also selects the
+            'cuda' or 'cpu' tag in the generated method name.
+        decorator(callable|None): Optional wrapper applied to the test
+            function before it is attached to the class.
+    """
+    # The name template has exactly one placeholder, so only the device tag
+    # is passed to format().
+    f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestBeamSearchDecoder, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    inject_test_train(_use_cuda_)
+
+for _use_cuda_ in (False, True):
+    _decorator_ = None
+    inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_)
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -12,6 +12,11 @@ endif(NOT WITH_MKLDNN)

 if(NOT WITH_DISTRIBUTE)
    list(REMOVE_ITEM TEST_OPS test_recv_op)
+    list(REMOVE_ITEM TEST_OPS test_dist_transpiler)  # distributed-only tests are dropped when WITH_DISTRIBUTE is off
+    list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler)
+    list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op)
+    list(REMOVE_ITEM TEST_OPS test_dist_mnist)
+    list(REMOVE_ITEM TEST_OPS test_dist_word2vec)
 endif(NOT WITH_DISTRIBUTE)

 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
@@ -43,13 +48,17 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
-py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+if(WITH_DISTRIBUTE)
+    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
+    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
+    py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)  # drives pserver/trainer processes, so only registered for distributed builds
+endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
-set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
-set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
-set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import sys
+import signal
+
+# Pin the random seed of both default programs to 1 so that repeated runs of
+# this script start from the same state.
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+# Stored on every SE_ResNeXt instance as `self.params`; the model code in this
+# file does not read any individual key.
+# NOTE(review): "steps" looks like the learning-rate value for each interval
+# of the piecewise schedule rather than step counts -- confirm before relying
+# on it.
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class SE_ResNeXt():
+    """Builds an SE-ResNeXt classification network out of fluid layers.
+
+    `layers` selects the variant; net() accepts 50, 101 or 152.
+    """
+
+    def __init__(self, layers=50):
+        self.params = train_parameters
+        self.layers = layers
+
+    def net(self, input, class_dim=1000):
+        """Append the whole network to the current program.
+
+        Args:
+            input: variable fed to the first convolution.
+            class_dim: output size of the final softmax fc layer.
+
+        Returns:
+            The output variable of the final fc layer (softmax activation).
+
+        Raises:
+            AssertionError: if self.layers is not one of 50, 101, 152.
+        """
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+        if layers == 50:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 6, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 101:
+            cardinality = 32
+            reduction_ratio = 16
+            depth = [3, 4, 23, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+        elif layers == 152:
+            cardinality = 64
+            reduction_ratio = 16
+            depth = [3, 8, 36, 3]
+            num_filters = [128, 256, 512, 1024]
+
+            # The 152-layer variant uses three stacked 3x3 convolutions as
+            # its stem instead of a single 7x7 convolution.
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=3,
+                stride=2,
+                act='relu')
+            conv = self.conv_bn_layer(
+                input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=128,
+                filter_size=3,
+                stride=1,
+                act='relu')
+            conv = fluid.layers.pool2d(
+                input=conv,
+                pool_size=3,
+                pool_stride=2,
+                pool_padding=1,
+                pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    # Stride 2 only on the first unit of every stage except
+                    # the first stage.
+                    stride=2 if i == 0 and block != 0 else 1,
+                    cardinality=cardinality,
+                    reduction_ratio=reduction_ratio)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+        drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+        out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
+        return out
+
+    def shortcut(self, input, ch_out, stride):
+        """Return the residual branch for a bottleneck block.
+
+        A 1x1 conv + batch norm is applied when the channel count
+        (input.shape[1]) differs from `ch_out` or when `stride` is not 1;
+        otherwise `input` is returned unchanged.
+        """
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            filter_size = 1
+            return self.conv_bn_layer(input, ch_out, filter_size, stride)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride, cardinality,
+                         reduction_ratio):
+        """Append one bottleneck unit and return its output variable.
+
+        The unit is 1x1 conv -> grouped 3x3 conv (groups=cardinality) ->
+        1x1 conv with 2 * num_filters outputs -> squeeze_excitation, added
+        to the shortcut branch with a ReLU on the sum.
+        """
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            groups=cardinality,
+            act='relu')
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+        scale = self.squeeze_excitation(
+            input=conv2,
+            num_channels=num_filters * 2,
+            reduction_ratio=reduction_ratio)
+
+        short = self.shortcut(input, num_filters * 2, stride)
+
+        return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None):
+        """Append a bias-free conv2d followed by batch_norm.
+
+        `act` is applied by the batch_norm layer, not by the convolution.
+        """
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            # Floor division keeps the padding an int under both Python 2
+            # and true division.
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=conv, act=act)
+
+    def squeeze_excitation(self, input, num_channels, reduction_ratio):
+        """Append a squeeze-and-excitation gate and return the scaled input.
+
+        Global average pooling feeds an fc of size
+        num_channels // reduction_ratio (ReLU), then an fc of size
+        num_channels (sigmoid); the result multiplies `input` via
+        elementwise_mul with axis=0.
+        """
+        pool = fluid.layers.pool2d(
+            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+        squeeze = fluid.layers.fc(input=pool,
+                                  # Layer sizes must be ints; use floor
+                                  # division rather than "/".
+                                  size=num_channels // reduction_ratio,
+                                  act='relu')
+        excitation = fluid.layers.fc(input=squeeze,
+                                     size=num_channels,
+                                     act='sigmoid')
+        scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+        return scale
+
+
+def get_model(batch_size):
+    """Build the SE-ResNeXt-50 training graph in the default programs.
+
+    Args:
+        batch_size: leading dimension of the constant image/label tensors and
+            batch size of the returned readers.
+
+    Returns:
+        Tuple of (test_program, avg_cost, train_reader, test_reader,
+        acc_top1, out) where `out` is the network's softmax output.
+    """
+    # Constant-filled tensors stand in for real images and labels.
+    image = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+
+    # Forward pass, loss and metrics.
+    predict = SE_ResNeXt(layers=50).net(input=image, class_dim=102)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc_top1 = fluid.layers.accuracy(input=predict, label=label, k=1)
+    # Not returned, but still appended to the program.
+    acc_top5 = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+    # Cloned before optimizer.minimize() is called below.
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    # Piecewise-constant learning rate: one value per interval between the
+    # boundaries, each a tenth of the previous one.
+    total_images = 6149  # flowers
+    step = int(total_images / batch_size + 1)
+    boundaries = [step * epoch for epoch in (30, 60, 90)]
+    base_lr = 0.1
+    values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)]
+
+    lr_schedule = fluid.layers.piecewise_decay(
+        boundaries=boundaries, values=values)
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=lr_schedule,
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    optimizer.minimize(avg_cost)
+
+    # Batched readers over the flowers dataset.
+    train_reader = paddle.batch(
+        paddle.dataset.flowers.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.flowers.test(), batch_size=batch_size)
+
+    return test_program, avg_cost, train_reader, test_reader, acc_top1, predict
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    """Create a DistributeTranspiler and transpile `main_program` with it.
+
+    All four arguments are forwarded to DistributeTranspiler.transpile();
+    the transpiler object is returned so callers can request the pserver or
+    trainer program from it.
+    """
+    transpiler = fluid.DistributeTranspiler()
+    transpile_kwargs = dict(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    transpiler.transpile(**transpile_kwargs)
+    return transpiler
+
+
+class DistSeResneXt2x2:
+    """Runs either the parameter-server or the trainer side of the model."""
+
+    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
+                    trainer_id):
+        """Build the model, transpile it and run the pserver program on CPU.
+
+        The startup program obtained from the transpiler is run first, then
+        the pserver program for `current_endpoint`.
+        """
+        get_model(batch_size=2)
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), pserver_endpoints,
+                           trainers)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(pserver_prog)
+
+    def _wait_ps_ready(self, pid):
+        """Block until /tmp/paddle.<pid>.port exists.
+
+        Sleeps 3 seconds before every check and fails with AssertionError
+        once the retry budget of 20 is used up.
+        """
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            print("waiting ps ready: ", pid)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
+        """Train for seven steps and print the first and the last loss.
+
+        Exactly two values are printed (first loss, then last loss).
+
+        Args:
+            place: device on which the startup program is executed.
+            endpoints: pserver endpoints handed to the transpiler.
+            trainer_id: id of this trainer.
+            trainers: total number of trainers.
+            is_dist: when False the transpiler is skipped entirely.
+        """
+        avg_cost = get_model(batch_size=20)[1]
+        if is_dist:
+            t = get_transpiler(trainer_id,
+                               fluid.default_main_program(), endpoints,
+                               trainers)
+            # The returned program is not needed here; the call is kept
+            # because it presumably finalises the default program in place
+            # -- TODO confirm against DistributeTranspiler.
+            t.get_trainer_program()
+
+        startup_exe = fluid.Executor(place)
+        startup_exe.run(fluid.default_startup_program())
+
+        strategy = fluid.ExecutionStrategy()
+        strategy.num_threads = 1
+        strategy.allow_op_delay = False
+        # NOTE(review): the first positional argument (use_cuda) is hard-coded
+        # to True regardless of `place` -- confirm this is intended for
+        # CPU-only builds.
+        exe = fluid.ParallelExecutor(
+            True,
+            loss_name=avg_cost.name,
+            exec_strategy=strategy,
+            num_trainers=trainers,
+            trainer_id=trainer_id)
+
+        # No feeding is required: get_model() builds its inputs with
+        # fill_constant.
+        first_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(first_loss)
+        for _ in range(5):
+            exe.run(fetch_list=[avg_cost.name])
+        last_loss, = exe.run(fetch_list=[avg_cost.name])
+        print(last_loss)
+
+
+def main(role="pserver",
+         endpoints="127.0.0.1:9123",
+         trainer_id=0,
+         current_endpoint="127.0.0.1:9123",
+         trainers=1,
+         is_dist=True):
+    """Dispatch to the pserver or the trainer side of DistSeResneXt2x2.
+
+    Any `role` other than "pserver" runs the trainer, on CUDAPlace(0) when
+    paddle was compiled with CUDA and on CPUPlace otherwise.
+    """
+    runner = DistSeResneXt2x2()
+    if role == "pserver":
+        runner.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
+        return
+
+    if core.is_compiled_with_cuda():
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+    runner.run_trainer(place, endpoints, trainer_id, trainers, is_dist)
+
+
+if __name__ == "__main__":
+    # Six positional arguments are required; stop after printing the usage
+    # instead of failing later with an IndexError on sys.argv.
+    if len(sys.argv) != 7:
+        print(
+            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
+        )
+        sys.exit(1)
+    role = sys.argv[1]
+    endpoints = sys.argv[2]
+    trainer_id = int(sys.argv[3])
+    current_endpoint = sys.argv[4]
+    trainers = int(sys.argv[5])
+    # Only the exact string "TRUE" enables distributed mode.
+    is_dist = True if sys.argv[6] == "TRUE" else False
+    main(
+        role=role,
+        endpoints=endpoints,
+        trainer_id=trainer_id,
+        current_endpoint=current_endpoint,
+        trainers=trainers,
+        is_dist=is_dist)
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -15,13 +15,13 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+from paddle.fluid import metrics


 class TestAucOp(OpTest):
    def setUp(self):
        self.op_type = "auc"
        pred = np.random.random((128, 2)).astype("float32")
-        indices = np.random.randint(0, 2, (128, 2))
        labels = np.random.randint(0, 2, (128, 1))
        num_thresholds = 200
        tp = np.zeros((num_thresholds, )).astype("int64")
@@ -30,8 +30,7 @@ class TestAucOp(OpTest):
        fn = np.zeros((num_thresholds, )).astype("int64")

        self.inputs = {
-            'Out': pred,
-            'Indices': indices,
+            'Predict': pred,
            'Label': labels,
            'TP': tp,
            'TN': tn,
@@ -39,57 +38,18 @@ class TestAucOp(OpTest):
            'FN': fn
        }
        self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
-        # NOTE: sklearn use a different way to generate thresholds
-        #       which will cause the result differs slightly:
-        # from sklearn.metrics import roc_curve, auc
-        # fpr, tpr, thresholds = roc_curve(labels, pred)
-        # auc_value = auc(fpr, tpr)
-        # we caculate AUC again using numpy for testing
-        kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
-                      for i in range(num_thresholds - 2)]
-        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]

-        # caculate TP, FN, TN, FP count
-        tp_list = np.ndarray((num_thresholds, ))
-        fn_list = np.ndarray((num_thresholds, ))
-        tn_list = np.ndarray((num_thresholds, ))
-        fp_list = np.ndarray((num_thresholds, ))
-        for idx_thresh, thresh in enumerate(thresholds):
-            tp, fn, tn, fp = 0, 0, 0, 0
-            for i, lbl in enumerate(labels):
-                if lbl:
-                    if pred[i, 0] >= thresh:
-                        tp += 1
-                    else:
-                        fn += 1
-                else:
-                    if pred[i, 0] >= thresh:
-                        fp += 1
-                    else:
-                        tn += 1
-            tp_list[idx_thresh] = tp
-            fn_list[idx_thresh] = fn
-            tn_list[idx_thresh] = tn
-            fp_list[idx_thresh] = fp
-
-        epsilon = 1e-6
-        tpr = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fn_list + epsilon)
-        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
-        rec = (tp_list.astype("float32") + epsilon) / (
-            tp_list + fp_list + epsilon)
-
-        x = fpr[:num_thresholds - 1] - fpr[1:]
-        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
-        auc_value = np.sum(x * y)
+        python_auc = metrics.Auc(name="auc",
+                                 curve='ROC',
+                                 num_thresholds=num_thresholds)
+        python_auc.update(pred, labels)

        self.outputs = {
-            'AUC': auc_value,
-            'TPOut': tp_list,
-            'FNOut': fn_list,
-            'TNOut': tn_list,
-            'FPOut': fp_list
+            'AUC': python_auc.eval(),
+            'TPOut': python_auc.tp_list,
+            'FNOut': python_auc.fn_list,
+            'TNOut': python_auc.tn_list,
+            'FPOut': python_auc.fp_list
        }

    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -100,6 +100,8 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
            np.array_equal(np.array(sentence_scores), expected_data))


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
    def setUp(self):
        self.scope = core.Scope()

--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -191,12 +191,16 @@ class TestWithDilation(TestConv2dTransposeOp):


 # ------------ test_cudnn ------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNN(TestConv2dTransposeOp):
    def init_op_type(self):
        self.use_cudnn = True
        self.op_type = "conv2d_transpose"


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithPad(TestWithPad):
    def init_test_case(self):
        self.pad = [1, 1]
@@ -212,6 +216,8 @@ class TestCUDNNWithPad(TestWithPad):
        self.op_type = "conv2d_transpose"


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
    def init_test_case(self):
        self.pad = [1, 1]
@@ -227,6 +233,8 @@ class TestCUDNNWithStride(TestWithStride):
        self.op_type = "conv2d_transpose"


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
    def init_test_case(self):
        self.pad = [1, 1]

--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -197,12 +197,16 @@ class TestWithDilation(TestConv3dTransposeOp):


 # ------------ test_cudnn ------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNN(TestConv3dTransposeOp):
    def init_op_type(self):
        self.use_cudnn = True
        self.op_type = "conv3d_transpose"


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithPad(TestWithPad):
    def init_test_case(self):
        self.pad = [1, 1, 1]
@@ -218,6 +222,8 @@ class TestCUDNNWithPad(TestWithPad):
        self.op_type = "conv3d_transpose"


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithStride(TestWithStride):
    def init_test_case(self):
        self.pad = [1, 1, 1]
@@ -233,6 +239,8 @@ class TestCUDNNWithStride(TestWithStride):
        self.op_type = "conv3d_transpose"


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestCUDNNWithGroups(TestWithGroups):
    def init_test_case(self):
        self.pad = [1, 1, 1]

--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -142,8 +142,7 @@ class TestDataBalance(unittest.TestCase):
                filenames=[self.lod_data_file_name],
                shapes=[[-1, 3], [-1, 1]],
                lod_levels=[1, 0],
-                dtypes=['float32', 'int32'],
-                thread_num=1)
+                dtypes=['float32', 'int32'])
            ins, label = fluid.layers.read_file(data_reader)

            place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
@@ -156,7 +155,7 @@ class TestDataBalance(unittest.TestCase):
                main_program=main_prog,
                build_strategy=build_strategy)

-            if (parallel_exe.device_count > self.batch_size):
+            if parallel_exe.device_count > self.batch_size:
                print("WARNING: Unittest TestDataBalance skipped. \
                    For the result is not correct when device count \
                    is larger than batch size.")
@@ -190,3 +189,7 @@ class TestDataBalance(unittest.TestCase):
    def test_all(self):
        self.main()
        self.main_lod()
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import unittest
+import os
+import signal
+import subprocess
+
+
+class TestDistSeResneXt2x2(unittest.TestCase):
+    """Compares local training with 2-trainer / 2-pserver training.
+
+    All processes are started as `python dist_se_resnext.py ...`
+    subprocesses.
+    """
+
+    def setUp(self):
+        self._trainers = 2
+        self._pservers = 2
+        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
+        self._python_interp = "python"
+
+    def start_pserver(self):
+        """Start one pserver process per endpoint and return both Popen objects."""
+        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
+        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
+            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
+
+        ps0_proc = subprocess.Popen(
+            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        ps1_proc = subprocess.Popen(
+            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return ps0_proc, ps1_proc
+
+    def _wait_ps_ready(self, pid):
+        """Block until /tmp/paddle.<pid>.port exists.
+
+        Sleeps 3 seconds before every check and fails with AssertionError
+        once the retry budget of 20 is used up.
+        """
+        retry_times = 20
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(3)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def non_test_with_place(self):
+        """Check that local and distributed runs report the same losses.
+
+        The name does not start with "test", so the default unittest loader
+        does not collect this method (presumably deliberate, see the GPU
+        requirement below).
+        """
+        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUS TO RUN
+        required_envs = {"FLAGS_fraction_of_gpu_memory_to_use": "0.15"}
+        # Forward only variables that are actually set: a None value in the
+        # env mapping makes subprocess.Popen fail.
+        for env_name in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH"):
+            env_value = os.getenv(env_name)
+            if env_value is not None:
+                required_envs[env_name] = env_value
+
+        # Run local to get a base line
+        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        env_local.update(required_envs)
+        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FALSE" % \
+            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
+        local_proc = subprocess.Popen(
+            local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
+        # communicate() drains the pipe while waiting, so the child cannot
+        # block on a full stdout buffer.
+        local_ret, _ = local_proc.communicate()
+
+        # Run dist train to compare with local results
+        ps0, ps1 = self.start_pserver()
+        FNULL = open(os.devnull, 'w')
+        try:
+            self._wait_ps_ready(ps0.pid)
+            self._wait_ps_ready(ps1.pid)
+
+            ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+            tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
+                (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
+            tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
+                (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)
+
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+            env0.update(required_envs)
+            env1.update(required_envs)
+
+            tr0_proc = subprocess.Popen(
+                tr0_cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=FNULL,
+                env=env0)
+            tr1_proc = subprocess.Popen(
+                tr1_cmd.split(" "),
+                stdout=subprocess.PIPE,
+                stderr=FNULL,
+                env=env1)
+
+            loss_data0, _ = tr0_proc.communicate()
+            tr1_proc.communicate()
+
+            # Each trainer prints the first and the last loss, one per line.
+            lines = loss_data0.split("\n")
+            dist_first_loss = eval(lines[0].replace(" ", ","))[0]
+            dist_last_loss = eval(lines[1].replace(" ", ","))[0]
+
+            local_lines = local_ret.split("\n")
+            local_first_loss = eval(local_lines[0])[0]
+            local_last_loss = eval(local_lines[1])[0]
+
+            self.assertAlmostEqual(local_first_loss, dist_first_loss)
+            self.assertAlmostEqual(local_last_loss, dist_last_loss)
+        finally:
+            # Always stop the pservers and release the devnull handle, even
+            # when the readiness wait or an assertion above fails.
+            os.kill(ps0.pid, signal.SIGKILL)
+            os.kill(ps1.pid, signal.SIGKILL)
+            FNULL.close()
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -22,6 +22,9 @@ import numpy

 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.io import ListenAndServ
+from paddle.fluid.layers.io import Recv
+from paddle.fluid.layers.io import Send


 class TestSendOp(unittest.TestCase):
@@ -65,8 +68,7 @@ class TestSendOp(unittest.TestCase):
        main = fluid.Program()

        with fluid.program_guard(main):
-            serv = layers.ListenAndServ(
-                "127.0.0.1:0", ["X"], optimizer_mode=False)
+            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
            with serv.do():
                out_var = main.global_block().create_var(
                    name="scale_0.tmp_0",
@@ -99,8 +101,8 @@ class TestSendOp(unittest.TestCase):
                persistable=False,
                shape=[32, 32])
            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
-            layers.Send("127.0.0.1:%d" % port, [x])
-            o = layers.Recv("127.0.0.1:%d" % port, [get_var])
+            Send("127.0.0.1:%d" % port, [x])
+            o = Recv("127.0.0.1:%d" % port, [get_var])

        exe = fluid.Executor(place)
        self.dist_out = exe.run(main, fetch_list=o)  # o is a list

--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest):

        velocity_out = mu * velocity + grad
        if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                        velocity_out * mu * learning_rate
        else:
            param_out = param - learning_rate * velocity_out
@@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest):

        velocity_out = mu * velocity + grad
        if use_nesterov:
-            param_out = param - grad * learning_rate + \
+            param_out = param - grad * learning_rate - \
                        velocity_out * mu * learning_rate
        else:
            param_out = param - learning_rate * velocity_out

--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -39,17 +39,17 @@ class TestMultipleReader(unittest.TestCase):
        copyfile('./mnist_0.recordio', './mnist_1.recordio')
        copyfile('./mnist_0.recordio', './mnist_2.recordio')

-    def main(self, thread_num):
+    def main(self, is_test=False):
        file_list = [
            './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
        ]
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            data_files = fluid.layers.open_files(
                filenames=file_list,
-                thread_num=thread_num,
                shapes=[(-1, 784), (-1, 1)],
                lod_levels=[0, 0],
-                dtypes=['float32', 'int64'])
+                dtypes=['float32', 'int64'],
+                is_test=is_test)
            img, label = fluid.layers.read_file(data_files)

            if fluid.core.is_compiled_with_cuda():
@@ -71,6 +71,9 @@ class TestMultipleReader(unittest.TestCase):
            self.assertEqual(batch_count, self.num_batch * 3)

    def test_main(self):
-        self.main(thread_num=3)  # thread number equals to file number
-        self.main(thread_num=10)  # thread number is larger than file number
-        self.main(thread_num=2)  # thread number is less than file number
+        self.main(is_test=False)
+        self.main(is_test=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -15,6 +15,7 @@
 import paddle.dataset.flowers as flowers
 import math
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import unittest
 import numpy as np
 import paddle
@@ -92,7 +93,8 @@ class TestFetchOp(unittest.TestCase):
            train_inputs.append(tst_reader_iter.next())

        os.environ['CPU_NUM'] = str(4)
-        self.parallel_exe(train_inputs, seed=1, use_cuda=True)
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(train_inputs, seed=1, use_cuda=True)
        self.parallel_exe(train_inputs, seed=1, use_cuda=False)


@@ -137,7 +139,8 @@ class TestFeedParallel(unittest.TestCase):

    def test_feed_op(self):
        os.environ['CPU_NUM'] = str(4)
-        self.parallel_exe(use_cuda=True, seed=1)
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(use_cuda=True, seed=1)
        self.parallel_exe(use_cuda=False, seed=1)



--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -14,6 +14,7 @@

 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy as np
 import paddle
 import paddle.dataset.mnist as mnist
@@ -32,9 +33,7 @@ def simple_fc_net(use_feed):
            filenames=[MNIST_RECORDIO_FILE],
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
+            dtypes=['float32', 'int64'])
        reader = fluid.layers.io.double_buffer(reader)
        img, label = fluid.layers.read_file(reader)
    hidden = img
@@ -60,9 +59,7 @@ def fc_with_batchnorm(use_feed):
            filenames=[MNIST_RECORDIO_FILE],
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'],
-            thread_num=1,
-            for_parallel=True)
+            dtypes=['float32', 'int64'])
        reader = fluid.layers.io.double_buffer(reader)
        img, label = fluid.layers.read_file(reader)

@@ -101,13 +98,25 @@ class TestMNIST(TestParallelExecutorBase):
            fluid.recordio_writer.convert_reader_to_recordio_file(
                MNIST_RECORDIO_FILE, reader, feeder)

+    def _init_data(self, random=True):  # returns an (img, label) pair of numpy arrays
+        np.random.seed(5)  # reseeded on every call, so the random batch is the same each time
+        if random:
+            img = np.random.random(size=[32, 784]).astype(np.float32)  # random float32 batch
+        else:
+            img = np.ones(shape=[32, 784], dtype='float32')  # constant all-ones batch
+        label = np.ones(shape=[32, 1], dtype='int64')  # every label is 1
+        return img, label
+
+    # simple_fc
    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
        self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
        self.check_network_convergence(
            simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)

-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+        img, label = self._init_data()
+
        self.check_network_convergence(
            simple_fc_net,
            feed_dict={"image": img,
@@ -115,6 +124,37 @@ class TestMNIST(TestParallelExecutorBase):
            use_cuda=use_cuda,
            use_reduce=use_reduce)

+    def check_simple_fc_convergence_with_Reduce(self, use_cuda):
+        """Run simple_fc_net with use_reduce=True and compare to use_reduce=False."""
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        self.check_network_convergence(
+            simple_fc_net, use_cuda=use_cuda, use_reduce=True)
+        self.check_network_convergence(
+            simple_fc_net,
+            use_cuda=use_cuda,
+            allow_op_delay=True,
+            use_reduce=True)
+        # Feed the same batch to both strategies so their losses can be compared.
+        img, label = self._init_data()
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=False)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=True)
+        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias.
+        for expected, actual in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEqual(expected, actual, delta=1e-6)
+        for expected, actual in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEqual(expected, actual, delta=1e-6)
+
    def test_simple_fc(self):
        # use_cuda
        self.check_simple_fc_convergence(True)
@@ -122,12 +162,15 @@ class TestMNIST(TestParallelExecutorBase):

    def test_simple_fc_with_new_strategy(self):
        # use_cuda, use_reduce
-        self.check_simple_fc_convergence(True, True)
-        self.check_simple_fc_convergence(False, True)
+        self.check_simple_fc_convergence_with_Reduce(True)
+        self.check_simple_fc_convergence_with_Reduce(False)
+
+    def check_simple_fc_parallel_accuracy(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        img, label = self._init_data(random=False)

-    def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False):
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
        single_first_loss, single_last_loss = self.check_network_convergence(
            method=simple_fc_net,
            seed=1000,
@@ -141,8 +184,7 @@ class TestMNIST(TestParallelExecutorBase):
            feed_dict={"image": img,
                       "label": label},
            use_cuda=use_cuda,
-            use_parallel_executor=True,
-            use_reduce=use_reduce)
+            use_parallel_executor=True)

        for p_f in parallel_first_loss:
            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
@@ -153,30 +195,53 @@ class TestMNIST(TestParallelExecutorBase):
        self.check_simple_fc_parallel_accuracy(True)
        self.check_simple_fc_parallel_accuracy(False)

-    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
-        # use_cuda, use_reduce
-        self.check_simple_fc_parallel_accuracy(True, True)
-        self.check_simple_fc_parallel_accuracy(False, True)
+    def check_batchnorm_fc_convergence(self, use_cuda):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return

-    def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False):
        self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
-        img = np.zeros(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
+
+        img, label = self._init_data()
+
        self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda)
+
+    def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
+        """Run fc_with_batchnorm with use_reduce=True and compare to use_reduce=False."""
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        self.check_network_convergence(
+            fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True)
+        # Feed the same batch to both strategies so their losses can be compared.
+        img, label = self._init_data()
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
            fc_with_batchnorm,
            feed_dict={"image": img,
                       "label": label},
            use_cuda=use_cuda,
-            use_reduce=use_reduce)
+            use_reduce=False)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_reduce=True)
+        # The *_last_loss values get a looser tolerance (1e-4) than the *_first_loss ones (1e-6).
+        for expected, actual in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEqual(expected, actual, delta=1e-6)
+        for expected, actual in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEqual(expected, actual, delta=1e-4)

    def test_batchnorm_fc(self):
        self.check_batchnorm_fc_convergence(True)
        self.check_batchnorm_fc_convergence(False)

    def test_batchnorm_fc_with_new_strategy(self):
-        # use_cuda, use_reduce
-        self.check_batchnorm_fc_convergence(True, True)
-        self.check_batchnorm_fc_convergence(False, True)
+        self.check_batchnorm_fc_convergence_use_reduce(True)
+        self.check_batchnorm_fc_convergence_use_reduce(False)


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -16,6 +16,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers.ops as ops
 from paddle.fluid.initializer import init_on_cpu
 from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+import paddle.fluid.core as core
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
@@ -140,6 +141,9 @@ class TestResnet(TestParallelExecutorBase):
                                                          use_reduce=False,
                                                          iter=20):

+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
        os.environ['CPU_NUM'] = str(4)

        def _cosine_decay(learning_rate, step_each_epoch, epochs=120):

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy as np
 import unittest
 import os
@@ -92,16 +93,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
    def test_parallel_testing(self):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
-        self.check_network_convergence(
-            use_cuda=True, build_strategy=build_strategy)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                use_cuda=True, build_strategy=build_strategy)
        self.check_network_convergence(
            use_cuda=False, build_strategy=build_strategy)

    def test_parallel_testing_with_new_strategy(self):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
-        self.check_network_convergence(
-            use_cuda=True, build_strategy=build_strategy)
+        if core.is_compiled_with_cuda():
+            self.check_network_convergence(
+                use_cuda=True, build_strategy=build_strategy)
        self.check_network_convergence(
            use_cuda=False, build_strategy=build_strategy)


--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -56,6 +56,8 @@ class TestPrintOpCPU(unittest.TestCase):
                       return_numpy=False)


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestPrintOpGPU(TestPrintOpCPU):
    def setUp(self):
        self.place = core.CUDAPlace(0)

--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -79,12 +79,18 @@ class TestProfiler(unittest.TestCase):
                pass_acc_calculator.add(value=acc, weight=b_size)
                pass_acc = pass_acc_calculator.eval()

+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
    def test_cpu_profiler(self):
        self.net_profiler('CPU')

+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
    def test_cuda_profiler(self):
        self.net_profiler('GPU')

+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
    def test_all_profiler(self):
        self.net_profiler('All', '/tmp/profile_out')
        with open('/tmp/profile_out', 'r') as f:

--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
@@ -45,12 +45,12 @@ class TestPyReader(unittest.TestCase):
            ) else fluid.CPUPlace()
            executor = fluid.Executor(place)

-            data_file, feed_queue = fluid.layers.py_reader(
+            data_file = fluid.layers.py_reader(
                capacity=self.capacity,
                dtypes=self.dtypes,
                lod_levels=self.lod_levels,
                shapes=self.shapes)
-
+            feed_queue = data_file.queue
            read_out_data = fluid.layers.read_file(data_file)
            self.inputs = []


--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -52,11 +52,13 @@ def simple_fc_net(in_size,
                  batch_size,
                  queue_capacity,
                  use_double_buffer=False):
-    reader, feed_queue = fluid.layers.py_reader(
+    reader = fluid.layers.py_reader(
        capacity=queue_capacity,
        shapes=[[-1, in_size], [-1, 1]],
        lod_levels=[0, 0],
-        dtypes=['float32', 'int64'])
+        dtypes=['float32', 'int64'],
+        use_double_buffer=False)
+    feed_queue = reader.queue
    reader = fluid.layers.batch(reader, batch_size=batch_size)
    if use_double_buffer:
        reader = fluid.layers.double_buffer(reader)

--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -89,15 +89,11 @@ class TestProdOp(OpTest):
        self.check_grad(['X'], 'Out')


-class TestKeepDimReduce(OpTest):
+class Test1DReduce(OpTest):
    def setUp(self):
        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
-        self.outputs = {
-            'Out':
-            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
-        }
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}

    def test_check_output(self):
        self.check_output()
@@ -106,32 +102,82 @@ class TestKeepDimReduce(OpTest):
        self.check_grad(['X'], 'Out')


-class Test1DReduce(OpTest):
+class Test2DReduce0(Test1DReduce):  # reduce_sum over dim 0 of a (20, 10) input
    def setUp(self):
        self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.attrs = {'dim': [0]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}

-    def test_check_output(self):
-        self.check_output()

-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+class Test2DReduce1(Test1DReduce):  # reduce_sum over dim 1 of a (20, 10) input
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))  # numpy reference result
+        }


-class TestReduceAll(OpTest):
+class Test3DReduce0(Test1DReduce):  # reduce_sum over dim 1 of a (5, 6, 7) input
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}  # NOTE(review): the "0" suffix is a case index, not dim 0 - confirm intended
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))  # numpy reference result
+        }
+
+
+class Test3DReduce1(Test1DReduce):  # reduce_sum over dim 2 (the last axis) of a (5, 6, 7) input
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))  # numpy reference result
+        }
+
+
+class Test3DReduce2(Test1DReduce):  # reduce_sum with a negative dim (-2) on a (5, 6, 7) input
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [-2]}  # negative index; numpy resolves -2 to axis 1 for a 3-D array
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))  # numpy reference result
+        }
+
+
+class Test3DReduce3(Test1DReduce):  # reduce_sum over two dims (1 and 2) of a (5, 6, 7) input
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1, 2]}
+        self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))  # numpy reference result
+        }
+
+
+class TestKeepDimReduce(Test1DReduce):  # reduce_sum over dim 1 with keep_dim=True on a (5, 6, 10) input
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])  # reference keeps the reduced axis
+        }
+
+
+class TestReduceAll(Test1DReduce):  # reduce_sum with reduce_all=True on a (5, 6, 2, 10) input
    def setUp(self):
        self.op_type = "reduce_sum"
        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
        self.attrs = {'reduce_all': True}
        self.outputs = {'Out': self.inputs['X'].sum()}

-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-

 ## reduction in multi dims
 class TestReduceMeanOpMultiAxises(OpTest):

--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -61,6 +61,8 @@ class TestSequenceSoftmaxOp(OpTest):


 # ----------------cudnn Sequencesoftmax----------------
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
    def init_op_type(self):
        self.use_cudnn = True

--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -63,11 +63,15 @@ class TestSoftmaxOp(OpTest):
            self.check_grad(["X"], "Out", max_relative_error=0.01)


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxCUDNNOp(TestSoftmaxOp):
    def init_kernel_type(self):
        self.use_cudnn = True


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxFP16Op(TestSoftmaxOp):
    def init_kernel_type(self):
        self.dtype = np.float16
@@ -79,6 +83,8 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
                self.check_output_with_place(place, atol=1e-3)


+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
    def init_kernel_type(self):
        self.use_cudnn = True

--- a/python/paddle/fluid/tests/unittests/test_version.py
+++ b/python/paddle/fluid/tests/unittests/test_version.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import re
+
+import paddle.version as fluid_version
+
+
+# Sanity checks for the fields read from `paddle.version` (aliased as
+# fluid_version): commit, istaged, major, minor, patch, rc and full_version.
+class VersionTest(unittest.TestCase):
+    def setUp(self):
+        # These patterns are applied with re.match, which anchors only at the
+        # start of the string; trailing characters after a match are accepted.
+        self._major_regex = "[0-9]+"
+        self._minor_regex = "[0-9]+"
+        # Digits, optionally followed by ".a.N", ".b.N" or ".rc.N".
+        self._patch_regex = "[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
+        self._rc_regex = "[0-9]+"
+        # "major.minor.patch" with the same optional a/b/rc suffix as above.
+        self._version_regex = "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
+        # 5 to 49 lowercase hexadecimal characters.
+        self._commit_regex = "[0-9a-f]{5,49}"
+
+    def test_check_output(self):
+        # check commit format
+        self.assertTrue(re.match(self._commit_regex, fluid_version.commit))
+        self.assertTrue(isinstance(fluid_version.istaged, bool))
+
+        # check version format
+        # NOTE(review): the all-zero version is expected when `istaged` is true
+        # and the regex checks run otherwise - confirm against how version.py
+        # sets `istaged` that these branches are not inverted.
+        if fluid_version.istaged:
+            # NOTE(review): major/minor/rc are compared with the int 0 here,
+            # while the else branch passes the same fields to re.match, which
+            # needs a string - confirm the types version.py actually stores.
+            self.assertEqual(fluid_version.major, 0)
+            self.assertEqual(fluid_version.minor, 0)
+            self.assertEqual(fluid_version.patch, "0")
+            self.assertEqual(fluid_version.rc, 0)
+            self.assertEqual(fluid_version.full_version, "0.0.0")
+        else:
+            self.assertTrue(re.match(self._major_regex, fluid_version.major))
+            self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
+            self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
+            self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
+            self.assertTrue(
+                re.match(self._version_regex, fluid_version.full_version))
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
                for i in xrange(buffer_size):
                    yield images[i, :], int(labels[i])
        finally:
-            m.terminate()
-            l.terminate()
+            try:
+                m.terminate()
+            except:
+                pass
+            try:
+                l.terminate()
+            except:
+                pass

    return reader


--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -104,6 +104,8 @@ packages=['paddle',
          'paddle.fluid.proto',
          'paddle.fluid.proto.profiler',
          'paddle.fluid.layers',
+          'paddle.fluid.contrib',
+          'paddle.fluid.contrib.decoder',
          'paddle.fluid.transpiler',
          'paddle.fluid.transpiler.details']