Merge remote-tracking branch 'upstream/develop' into mkl_packed

290edd8f · tensor-tang · 82091035 · 6b475981 · 290edd8f · 290edd8f
287 changed file
--- a/README.md
+++ b/README.md
@@ -61,32 +61,32 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation

 It is recommended to check out the
-[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).

 ## Documentation

-We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
-[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.

- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)

   You can also run distributed training jobs on Kubernetes clusters.

- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)

   We appreciate your contributions!


--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -6,8 +6,18 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
    "train.list", None, module="provider", obj="process", args=args)

@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)

 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

@@ -40,11 +50,11 @@ net = img_conv_layer(
    input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)

 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)

 net = fc_layer(
@@ -59,6 +69,9 @@ net = fc_layer(
    layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())

-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,

--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
    else:
        settings.data_size = settings.height * settings.width
    settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
    if settings.is_infer:
        settings.slots = [dense_vector(settings.data_size)]
    else:
@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
 @provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(2560 if settings.is_infer else 1024):
+    for i in xrange(settings.num_samples):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
        if settings.is_infer:
            yield img.astype('float32')

--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,

--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
@@ -37,7 +37,7 @@ function infer() {
      --trainer_count=1 \
      --num_passes=1 \
      --save_dir="models/${topology}-${layer_num}" \
-      --config_args="batch_size=128,layer_num=${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
      > /dev/null 2>&1
    echo "Done"
  fi
@@ -79,8 +79,9 @@ fi
 # inference benchmark
 for use_mkldnn in True False; do
  for batchsize in 1 2 4 8 16; do
-    infer googlenet v1 $batchsize $use_mkldnn
-    infer resnet 50 $batchsize $use_mkldnn
    infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_mkldnn_train.sh
+++ b/benchmark/paddle/image/run_mkldnn_train.sh
@@ -28,6 +28,10 @@ function train() {
    --test_period=100 \
    --config_args=$args \
    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
 }

 if [ ! -f "train.list" ]; then
@@ -43,5 +47,6 @@ for use_mkldnn in True False; do
    train vgg 19 $batchsize $use_mkldnn
    train resnet 50 $batchsize $use_mkldnn
    train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
  done
 done
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  if [ $thread -gt $bs ]; then
+    thread=$bs
+  fi
+  log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "./run_mkldnn_infer.sh to save the model first"
+    exit 0
+  fi
+  log_period=$((32 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # calculate the time of the last 5 log periods, i.e. 160 (= 32 * 5) samples;
+  # the time before that is warm-up time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec);" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+  infer vgg 19 $batchsize
+  infer resnet 50 $batchsize 
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
+done
--- a/benchmark/paddle/image/run_openblas_train.sh
+++ b/benchmark/paddle/image/run_openblas_train.sh
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  # each trainer_count use only 1 core to avoid conflict
+  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=3 \
+    --test_period=30 \
+    --config_args=$args \
+    2>&1 | tee ${log} 
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+  train vgg 19 $batchsize
+  train resnet 50 $batchsize
+  train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
+done
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -7,13 +7,15 @@ num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
 is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)

 args = {
    'height': height,
    'width': width,
    'color': True,
    'num_class': num_class,
-    'is_infer': is_infer
+    'is_infer': is_infer,
+    'num_samples': num_samples
 }
 define_py_data_sources2(
    "train.list" if not is_infer else None,

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
    IF(WITH_C_API)
        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
        IF(ANDROID)
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
        ELSE()
-            INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
        ENDIF()
    ENDIF()


--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -467,7 +467,7 @@ lambda_cost
    :noindex:

 square_error_cost
--------
+-----------------
 ..  autoclass:: paddle.v2.layer.square_error_cost
    :noindex:

@@ -533,7 +533,7 @@ Miscs
 =====

 dropout
--------------
+--------
 ..  autoclass:: paddle.v2.layer.dropout
    :noindex:


--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -19,17 +19,17 @@ dynamic_lstm
    :noindex:

 data
---------
+----
 ..  autofunction:: paddle.v2.fluid.layers.data
    :noindex:

 mean
---------
+----
 ..  autofunction:: paddle.v2.fluid.layers.mean
    :noindex:

 mul
---------
+---
 ..  autofunction:: paddle.v2.fluid.layers.mul
    :noindex:

@@ -45,13 +45,13 @@ elementwise_div


 dropout
---------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.dropout
    :noindex:


 reshape
---------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.reshape
    :noindex:

@@ -81,67 +81,67 @@ transpose


 sigmoid_cross_entropy_with_logits
---------
+---------------------------------
 ..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
    :noindex:


 cast
---------
+----
 ..  autofunction:: paddle.v2.fluid.layers.cast
    :noindex:


 concat
---------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.concat
    :noindex:


 sums
---------
+----
 ..  autofunction:: paddle.v2.fluid.layers.sums
    :noindex:


 linear_chain_crf
---------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
    :noindex:


 assign
---------
+-------
 ..  autofunction:: paddle.v2.fluid.layers.embedding
    :noindex:


 split_lod_tensor
---------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
    :noindex:


 merge_lod_tensor
---------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
    :noindex:

 cos_sim
---------
+--------
 ..  autofunction:: paddle.v2.fluid.layers.cos_sim
    :noindex:


 cross_entropy
---------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.cross_entropy
    :noindex:



 square_error_cost
---------
+-----------------
 ..  autofunction:: paddle.v2.fluid.layers.square_error_cost
    :noindex:

@@ -153,74 +153,68 @@ accuracy


 sequence_conv
---------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_conv
    :noindex:


 conv2d
---------
+------
 ..  autofunction:: paddle.v2.fluid.layers.conv2d
    :noindex:


 sequence_pool
---------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.sequence_pool
    :noindex:


 pool2d
---------
+------
 ..  autofunction:: paddle.v2.fluid.layers.pool2d
    :noindex:


 batch_norm
---------
+----------
 ..  autofunction:: paddle.v2.fluid.layers.batch_norm
    :noindex:


 beam_search_decode
---------
+------------------
 ..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
    :noindex:


-lstm
---------
-..  autofunction:: paddle.v2.fluid.layers.lstm
-    :noindex:
-
-
 lod_rank_table
---------
+--------------
 ..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
    :noindex:


 max_sequence_len
---------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
    :noindex:


 topk
---------
+-----
 ..  autofunction:: paddle.v2.fluid.layers.topk
    :noindex:


 lod_tensor_to_array
---------
+-------------------
 ..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
    :noindex:



 array_to_lod_tensor
---------
+-------------------
 ..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
    :noindex:

@@ -228,26 +222,26 @@ array_to_lod_tensor


 fill_constant
---------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.fill_constant
    :noindex:



 fill_constant_batch_size_like
---------
+-----------------------------
 ..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
    :noindex:


 ones
---------
+----
 ..  autofunction:: paddle.v2.fluid.layers.ones
    :noindex:


 zeros
---------
+-----
 ..  autofunction:: paddle.v2.fluid.layers.zeros
    :noindex:

@@ -259,14 +253,14 @@ increment


 array_write
---------
+-----------
 ..  autofunction:: paddle.v2.fluid.layers.array_write
    :noindex:



 create_array
---------
+------------
 ..  autofunction:: paddle.v2.fluid.layers.create_array
    :noindex:

@@ -278,25 +272,55 @@ less_than


 array_read
---------
+----------
 ..  autofunction:: paddle.v2.fluid.layers.array_read
    :noindex:


 shrink_memory
---------
+--------------
 ..  autofunction:: paddle.v2.fluid.layers.shrink_memory
    :noindex:


 array_length
---------
+-------------
 ..  autofunction:: paddle.v2.fluid.layers.array_length
    :noindex:


 conv2d_transpose
---------
+----------------
 ..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
    :noindex:

+
+sequence_expand
+---------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+
+sequence_softmax
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+    :noindex:
+
+
+reduce_sum
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+
+reduce_mean
+-----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+    :noindex:
+
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -3,19 +3,19 @@ Nets
 ===========

 simple_img_conv_pool
-----------
+--------------------
 ..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
    :noindex:


 img_conv_group
-----------
+---------------
 ..  autofunction:: paddle.v2.fluid.nets.img_conv_group
    :noindex:


 sequence_conv_pool
-----------
+------------------
 ..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
    :noindex:


--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -18,7 +18,7 @@ SGDOptimizer


 MomentumOptimizer
-----------
+-----------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: MomentumOptimizer
    :noindex:
@@ -26,14 +26,14 @@ MomentumOptimizer


 AdagradOptimizer
-----------
+----------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: AdagradOptimizer
    :noindex:


 AdamOptimizer
-----------
+-------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: AdamOptimizer
    :noindex:
@@ -47,7 +47,7 @@ AdamaxOptimizer


 DecayedAdagradOptimizer
-----------
+-----------------------
 ..  automodule:: paddle.v2.fluid.optimizer
    :members: DecayedAdagradOptimizer
    :noindex:

--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -3,14 +3,14 @@ Regularizer
 ===========

 WeightDecayRegularizer
-----------
+----------------------
 ..  automodule:: paddle.v2.fluid.regularizer
    :members: WeightDecayRegularizer
    :noindex:


 L2DecayRegularizer
-----------
+------------------
 ..  automodule:: paddle.v2.fluid.regularizer
    :members: L2DecayRegularizer
    :noindex:
@@ -18,7 +18,7 @@ L2DecayRegularizer


 L1DecayRegularizer
-----------
+-------------------
 ..  automodule:: paddle.v2.fluid.regularizer
    :members: L1DecayRegularizer


--- a/doc/design/executor.md
+++ b/doc/design/executor.md
 # Executor Design Doc

 ## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

-We use executor to do the runtime evaluation of a `ProgramDesc`.
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used; the executor explicitly executes the stored precompiled code.

 ## Overview

-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instance, which is persistent throughout different runs.
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.

-### What does executor do?
+## Executor

-It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to pushing a stack frame when entering a block, after which it cleans up all the temporary variables when a mini-batch is finished. It does not, however, have a stack frame pop process.

-### What does executor NOT do?
+### The interface
+```c++
+  Executor(places);
+```
+An executor does not own any computing resources; a user can only construct an executor using the specified places.

-It does not do runtime optimization, meaning intelligently parse the dependency of each op a choose which one to be run and in which order they should be run.
+### Running an Executor

-It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
-
-## Implementation
-
-`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then run all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
--- a/doc/design/kernel_hint_design.md
+++ b/doc/design/kernel_hint_design.md
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_`, `data_type_` and `layout_` can be obtained from the input tensors of the operator. `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+
+So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing: let the user add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but there is a small problem: users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options and use a single attr key such as `kernel_hint` for the user. This is not very flexible if the user wants to define other kinds of hints.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn`, for users to choose from.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=False):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
--- a/doc/design/mkl/mkl_packed.md
+++ b/doc/design/mkl/mkl_packed.md
@@ -30,10 +30,10 @@
 由于在现有的某些情况下（例如RNN），多次调用 cblas_?gemm 会使用相同的原数据，因此，每次调用时对原数据的重复Packing便成为了冗余。

 为了最大程度减少多次调用 cblas_?gemm 在Packing上的耗时，Intel® MKL 引入了以下四个API:
-   * cblas_?gemm_alloc
-   * cblas_?gemm_pack 
-   * cblas_?gemm_compute
-   * cblas_?gemm_free
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)

 通过使用这些API，我们可以先完成对原数据的Packing操作，再把已转换为Packed格式的数据传递给那些复用同一数据的gemm_compute函数，从而避免了Packing冗余。

@@ -84,7 +84,20 @@ PaddlePaddle/Paddle
 2. 对比优化后layer与相对应的PaddlePaddle原有layer, 在batch mode下的结果。

 ### Python API
-TBD
+计划在`paddle/utils.Flags`中添加`use_mkl_packed`的flag，用于选择是否使用相关功能，并且当编译时`WITH_MKL=ON`的情况下，默认设置为`true`。
+
+同时，在`python/paddle/trainer/config_parser.py`中对应的layer处，添加`use_mkl_packed`这个选择，方便用户在Python端选择是否启用这个功能。
+
+具体实现方式比如：
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+所有相关的`layer_type`会以*mkl_packed_*开头，这些会在`MKLPacked*Layer`注册layer的时候保证，以示区分。 
+

 ### Benchmarking
 会添加相应的脚本用于测试和对比在使用MKL Packed recurrent layers 前后的网络性能。

--- a/doc/design/operator_kernel_type.md
+++ b/doc/design/operator_kernel_type.md
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, a certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. 
+
+We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support several devices.
+
+For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+
+It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as follows:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where the data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now has two default DeviceContexts for CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support a new Library, a new enumerator needs to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
+
+Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually an enum variable. If fluid is built with MKLDNN, then the memory formats in MKLDNN will be added into this enum variable too.
+
+- Users have to set the layout for input data. And some operators, like fill_constant/random, also have to set the layout of the generated data. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not compile-time.
+
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example: if we want to implement an MKLDNN convolution operator, we have to implement all the kernels for the different layouts, listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
+
+`Layout` is also defined as an enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
--- a/doc/design/refactor/multi_cpu.md
+++ b/doc/design/refactor/multi_cpu.md
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run the user-defined Op graph
+with multiple CPUs. We will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+<img src="src/multi-threads/single-thread@3x.png" width="300">
+
+After converted:
+
+<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+
+## Implement
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and block in `Wait`
+  until the atomic counter becomes `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] {bc.DecrementCount(); })
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool which is a Singleton.
+  - Use a block id as the input, and run the specified Block in an independent scope
+    with multi-threads.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the Input Tensor into a TensorArray.
+- `Merge` merges all the gradients which are calculated in different threads
+  with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`.
+
+## TODO
+
+- Improve the optimizer stage with multi-threads, since we could
+  assign the parameters to the different threads and execute
+  optimizer with multi-threads.
--- a/doc/design/refactor/src/multi-threads.graffle
+++ b/doc/design/refactor/src/multi-threads.graffle
--- a/doc/design/refactor/src/multi-threads/multi-threads@3x.png
+++ b/doc/design/refactor/src/multi-threads/multi-threads@3x.png
--- a/doc/design/refactor/src/multi-threads/single-thread@3x.png
+++ b/doc/design/refactor/src/multi-threads/single-thread@3x.png
--- a/doc/design/switch_kernel.md
+++ b/doc/design/switch_kernel.md
+## Background
+Every operator has many kernels because there are multiple data types, places, data layout that Fluid supports. We use the `KernelType` to describe kernel types that operators can hold. 
+
+The `KernelType` is as follows.
+
+```
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+
+The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`.
+
+The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be a major `data_type`. For example, the `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`.
+
+The `layout` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+
+## Problem
+
+We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators will take too much memory. It is better to force them onto CPU. However, the rest of the operators in this neural network will be performed on GPU, i.e., a model parallel problem.
+3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c` and no other library uses `nChw8c`.
+
+Problems under these situations are similar. We can formalise this problem as follows.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the input of this operator from $kt_{?}$ to one of the kernel types in $KT$?
+
+## Solution
+
+It is clear that transforming the inputs of an operator to adapt to another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+
+We can infer a kernel type from the inputs of an operator. We call this kernel type the `actual kernel type`, which means it is the kernel type that the operator should actually be performed on.
+
+We can get a kernel type from 1) the configuration of the operator description (users may want to force the use of `MKL` for the `conv` operator), and 2) the place of the current executor (e.g., the executor is running on GPU). This kernel type is what we expect the operator to be performed on. We call this kernel type the `expect kernel type`.
+
+We transform the input data from `actual` to `expect` if the expect kernel type is not the same as the actual kernel type.
+
+The algorithm is described as follows:
+
+```cpp
+using DataTransformationFN = std::function<void(const Tensor& in, Tensor* out)>;
+using KernelTypePair = std::pair<KernelType, KernelType>;
+
+map<KernelTypePair, DataTransformationFN> g_data_transformation_;
+
+void OpWithKernel::Run() {
+  vec<Tensor> inputs = ...
+  auto actual_kernel_type = GetActualKernelType(inputs);
+  
+  // The expected kernel type is related to actual kernel type.
+  // For the most operators, the expected kernel type is as same as
+  // actual kernel type.
+  //
+  // So we pass `actual_kernel_type` as a parameter of 
+  // GetExpectedKernelType
+  auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type);
+  
+  auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}];
+  
+  kernel.run(trans(inputs));
+}
+```
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖（包含但不限于），其
   :header: "依赖", "版本", "说明"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
-   "Python", "2.7.x", "依赖libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "可选"
+   "Go", ">=1.8", "可选"


 .. _build_options:

--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -76,13 +76,13 @@ will be downloaded automatically.
   :header: "Dependency", "Version", "Description"
   :widths: 10, 15, 30

-   "CMake", ">=3.5", ""
+   "CMake", ">=3.2", ""
   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
-   "Python", "2.7.x", "Need libpython2.7.so"
-   "pip", ">=9.0", ""
-   "numpy", "", ""
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
   "SWIG", ">=2.0", ""
-   "Go", ">=1.8", "Optional"
+   "Go", ">=1.8", "Optional"


 .. _build_options:

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -128,7 +128,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note

 AVX是一种CPU指令集，可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认
 是开启AVX编译的，所以，如果您的电脑不支持AVX，需要单独
-`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。
+`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。

 以下指令能检查Linux电脑是否支持AVX：


--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -137,7 +137,7 @@ GPU driver installed before move on.
 AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations.
 The latest PaddlePaddle Docker image turns AVX on by default, so, if your
 computer doesn't support AVX, you'll probably need to
-`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`.

 The following command will tell you whether your computer supports AVX.


--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具
    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

 .. _pip_dependency:


--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3

-    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"

 .. _pip_dependency:


--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -53,7 +53,7 @@ Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -82,7 +82,7 @@ The equation is: Out = X * Y
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of scale operator.").NotInGradient();
    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();

--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -50,7 +50,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio
 ```cpp
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
@@ -79,7 +79,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of scale operator.").NotInGradient();
    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();

--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
-# PaddlePaddle分布式训练
+# 分布式训练


 ## 概述
@@ -181,8 +181,8 @@ PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务

 ## 在不同集群中运行

-  - [fabric](fabric_cn.md)
-  - [openmpi](openmpi_cn.md)
-  - [kubernetes](k8s_cn.md)
-  - [kubernetes distributed](k8s_distributed_cn.md)
-  - [kubernetes on AWS](k8s_aws_cn.md)
+  - [fabric集群](fabric_cn.md)
+  - [openmpi集群](openmpi_cn.md)
+  - [kubernetes单机](k8s_cn.md)
+  - [kubernetes distributed分布式](k8s_distributed_cn.md)
+  - [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
-# PaddlePaddle Distributed Training
+# Distributed Training

 ## Introduction

@@ -188,5 +188,4 @@ These cluster platforms provide API or environment variables for training proces
  - [fabric](fabric_en.md)
  - [openmpi](openmpi_en.md)
  - [kubernetes](k8s_en.md)
-  - kubernetes distributed
  - [kubernetes on AWS](k8s_aws_en.md)
--- a/doc/howto/usage/cluster/k8s_aws_en.md
+++ b/doc/howto/usage/cluster/k8s_aws_en.md
@@ -493,7 +493,7 @@ spec:
    spec:
      containers:
      - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/efs"
@@ -522,7 +522,7 @@ NAME          DESIRED   SUCCESSFUL   AGE
 paddle-data   1         1            6m
 ```

-Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.

 #### Start Training

@@ -545,7 +545,7 @@ spec:
          claimName: efsvol
      containers:
      - name: trainer
-        image: paddledev/paddle-tutorial:k8s_train
+        image: paddlepaddle/paddle-tutorial:k8s_train
        command: ["bin/bash",  "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
@@ -617,7 +617,7 @@ kubectl --kubeconfig=kubeconfig log -f POD_NAME

 Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.

-The details for start `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.

 #### Inspect Training Output


--- a/doc/howto/usage/cluster/k8s_cn.md
+++ b/doc/howto/usage/cluster/k8s_cn.md
 # Kubernetes单机训练

-在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
+在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。

 ## 制作Docker镜像

-在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
-Paddle的Docker image里。为此，我们需要制作一个包含训练数据的Paddle镜像。
+在一个功能齐全的Kubernetes机群里，通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话，一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里，我们只演示一个单机作业，所以可以简化对环境的要求，把训练数据直接放在
+PaddlePaddle的Docker Image里。为此，我们需要制作一个包含训练数据的PaddlePaddle镜像。

-Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) 
-里介绍了用Paddle源码中的脚本下载训练数据的过程。
-而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo，（ 请注意，默认的
-Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ），所以我们使用这个镜像来下载训练数据到Docker container中，然后把这个包含了训练数据的container保存为一个新的镜像。
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo，
+（请注意，默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的，PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)），
+下面我们使用这个镜像来下载数据到Docker Container中，并把这个包含了训练数据的Container保存为一个新的镜像。

 ### 运行容器

 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```

 ### 下载数据
@@ -103,7 +104,7 @@ spec:
      restartPolicy: Never
 ```

-### 创建Paddle Job
+### 创建PaddlePaddle Job

 使用上文创建的yaml文件创建Kubernetes Job，命令为：


--- a/doc/howto/usage/cluster/k8s_distributed_cn.md
+++ b/doc/howto/usage/cluster/k8s_distributed_cn.md
@@ -2,8 +2,6 @@

 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。

-有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
-
 ## 整体方案

 在训练之前，用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统，需要使用其制定的方式挂载后并导入数据)，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
@@ -28,7 +26,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行
 - 拷贝训练文件到容器内
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练

-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。

 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -62,7 +60,7 @@ spec:
      hostNetwork: true
      containers:
      - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
        imagePullPolicy: Always
        volumeMounts:
        - mountPath: "/mnt"

--- a/doc/howto/usage/cluster/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s_en.md
-# Paddle On Kubernetes
+# PaddlePaddle On Kubernetes

->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+In this article, we will introduce how to run a PaddlePaddle training job on a single CPU machine using Kubernetes. In the next article, we will introduce how to run a PaddlePaddle training job on a distributed cluster.

 ## Build Docker Image

-In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+In a distributed Kubernetes cluster, we would use Ceph or another distributed
+storage system to store training-related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo a training job on a single machine. In order to simplify the requirements
+of the environment, we will put the training data directly into the PaddlePaddle Docker Image,
+so we need to create a PaddlePaddle Docker Image that includes the training data.

-Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
-And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+The Docker Image `paddlepaddle/paddle:cpu-demo-latest` includes the PaddlePaddle
+source code and demo. (Caution: the default PaddlePaddle production Docker Image `paddlepaddle/paddle:latest` doesn't include
+the source code. For the different versions of the PaddlePaddle Docker Image, refer to the
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).)
+So we run a Container from this Docker Image, download the training data, and then commit the whole
+Container as a new Docker Image.

 ### Run Docker Container

 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```

 ### Download Training Data
@@ -67,7 +76,7 @@ $ docker commit quick_start_data mypaddle/paddle:quickstart

 ## Use Kubernetes For Training

->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+We will use a Kubernetes job for the training process. The following steps show how to do the training with Kubernetes.

 ### Create Yaml Files

@@ -99,7 +108,7 @@ spec:
      restartPolicy: Never
 ```

-### Start Paddle Job
+### Start PaddlePaddle Job

 Using the above yaml file to start the Kubernetes job.


--- a/doc/howto/usage/cluster/src/Dockerfile
+++ b/doc/howto/usage/cluster/src/Dockerfile
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest

 MAINTAINER zjsxzong89@gmail.com


--- a/doc/howto/usage/cluster/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest

 COPY start.sh /root/
 COPY start_paddle.py /root/

--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -19,42 +19,42 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
  switch (attr_desc.type()) {
-    case framework::AttrType::BOOLEAN: {
+    case proto::AttrType::BOOLEAN: {
      return attr_desc.b();
    }
-    case framework::AttrType::INT: {
+    case proto::AttrType::INT: {
      return attr_desc.i();
    }
-    case framework::AttrType::FLOAT: {
+    case proto::AttrType::FLOAT: {
      return attr_desc.f();
    }
-    case framework::AttrType::STRING: {
+    case proto::AttrType::STRING: {
      return attr_desc.s();
    }
-    case framework::AttrType::BOOLEANS: {
+    case proto::AttrType::BOOLEANS: {
      std::vector<bool> val(attr_desc.bools_size());
      for (int i = 0; i < attr_desc.bools_size(); ++i) {
        val[i] = attr_desc.bools(i);
      }
      return val;
    }
-    case framework::AttrType::INTS: {
+    case proto::AttrType::INTS: {
      std::vector<int> val(attr_desc.ints_size());
      for (int i = 0; i < attr_desc.ints_size(); ++i) {
        val[i] = attr_desc.ints(i);
      }
      return val;
    }
-    case framework::AttrType::FLOATS: {
+    case proto::AttrType::FLOATS: {
      std::vector<float> val(attr_desc.floats_size());
      for (int i = 0; i < attr_desc.floats_size(); ++i) {
        val[i] = attr_desc.floats(i);
      }
      return val;
    }
-    case framework::AttrType::STRINGS: {
+    case proto::AttrType::STRINGS: {
      std::vector<std::string> val(attr_desc.strings_size());
      for (int i = 0; i < attr_desc.strings_size(); ++i) {
        val[i] = attr_desc.strings(i);

--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -27,12 +27,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 template <typename T>
-inline AttrType AttrTypeID() {
+inline proto::AttrType AttrTypeID() {
  Attribute tmp = T();
-  return static_cast<AttrType>(tmp.which() - 1);
+  return static_cast<proto::AttrType>(tmp.which() - 1);
 }

-Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);

 class AttrReader {
 public:

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -42,7 +42,7 @@ static std::unordered_set<std::string>& CtrlFlowOps() {
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
    std::unordered_map<std::string, std::string>* grad_to_var) {
-  OpDescBind op_desc;
+  OpDesc op_desc;
  op_desc.SetInputMap(op.Inputs());
  op_desc.SetOutputMap(op.Outputs());
  op_desc.SetType(op.Type());
@@ -53,7 +53,7 @@ static inline std::unique_ptr<OperatorBase> CreateGradOp(
  grad_ops.reserve(grad_descs.size());
  std::transform(grad_descs.begin(), grad_descs.end(),
                 std::back_inserter(grad_ops),
-                 [](const std::unique_ptr<OpDescBind>& grad_desc) {
+                 [](const std::unique_ptr<OpDesc>& grad_desc) {
                   return OpRegistry::CreateOp(*grad_desc);
                 });
  PADDLE_ENFORCE(!grad_ops.empty());
@@ -217,7 +217,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
        // If part of input gradient of that operator is not calculated, fill
        // zero variables to that input gradient.
        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Y", {grad_input}}},
+                                           {{"Out", {grad_input}}},
                                           AttributeMap{}));
      }
      return false;
@@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) {
 static void CreateGradVarInBlock(
    size_t grad_op_start_index,
    const std::unordered_map<std::string, std::string>& param_name_map,
-    BlockDescBind* block_desc,
+    BlockDesc* block_desc,
    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
  auto ops = block_desc->AllOps();
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
@@ -341,7 +341,7 @@ static void CreateGradVarInBlock(
      auto* param = block_desc->FindVarRecursive(pname);
      auto* grad = block_desc->FindVar(arg);
      if (param == nullptr) {
-        grad->SetDataType(DataType::FP32);
+        grad->SetDataType(proto::DataType::FP32);
      } else {
        grad->SetDataType(param->GetDataType());
      }
@@ -350,12 +350,11 @@ static void CreateGradVarInBlock(
  }
 }

-std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
-    const OpDescBind* op_desc, std::unordered_set<std::string>* no_grad_vars,
+std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
+    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var,
-    const std::vector<BlockDescBind*>& grad_block =
-        std::vector<BlockDescBind*>()) {
-  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
+  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
  // All input gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
  if (AllGradInSet(inputs, *no_grad_vars)) {
@@ -386,7 +385,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
          .Get(op_desc->Type())
          .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);

-  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
  for (auto& desc : grad_op_descs) {
    for (const std::string& in_name : desc->InputArgumentNames()) {
      if (no_grad_vars->count(in_name)) {
@@ -394,9 +393,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
        std::string new_name = prefix + kZeroVarSuffix;
        desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDescBind> fill_zeros_op(
-            new OpDescBind("fill_zeros_like", {{"X", {prefix}}},
-                           {{"Y", {new_name}}}, AttributeMap{}));
+        std::unique_ptr<OpDesc> fill_zeros_op(
+            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
+                       {{"Out", {new_name}}}, AttributeMap{}));
        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
      }
    }
@@ -408,34 +407,33 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
  return grad_op_descs;
 }

-static BlockDescBind* CreateStepBlock(
-    ProgramDescBind& program_desc,
-    std::unordered_set<std::string>* no_grad_vars,
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var,
    int step_block_idx);

-std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
-    ProgramDescBind& program_desc, int block_idx,
+std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
+    ProgramDesc& program_desc, int block_idx,
    std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var) {
  VLOG(5) << "MakeBlockBackward";
-  BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
-  std::vector<OpDescBind*> op_descs = cur_block->AllOps();
+  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
+  std::vector<OpDesc*> op_descs = cur_block->AllOps();
  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
  size_t grad_desc_idx = 0;
-  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  std::vector<std::unique_ptr<OpDesc>> backward_descs;

  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
    VLOG(5) << "Making backward " << (*it)->Type() << " op";
-    std::vector<std::unique_ptr<OpDescBind>> op_grads;
+    std::vector<std::unique_ptr<OpDesc>> op_grads;

    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
      int step_block_idx = (*it)->GetBlockAttr("sub_block");
-      BlockDescBind* backward_block = CreateStepBlock(
-          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
+                                                  grad_to_var, step_block_idx);
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
    } else if ((*it)->Type() == "conditional_block") {
-      BlockDescBind* backward_block =
+      BlockDesc* backward_block =
          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
                          (*it)->GetBlockAttr("sub_block"));
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
@@ -463,14 +461,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
      }
      ++grad_desc_idx;
    }
-    std::transform(
-        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
-        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+    std::transform(op_grads.begin(), op_grads.end(),
+                   std::back_inserter(backward_descs),
+                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
  }

  VLOG(5) << "Appending Sums";
  // Check whether some variables are written more than once
-  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
  for (const auto& dup : dup_out_ops) {
    const std::string& out_name = dup.first;
    const std::vector<size_t> dup_op = dup.second;
@@ -486,16 +484,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
        sum_op_inputs.emplace_back(new_name);
        next_g_name = sum_op_inputs.back();
      }
-      std::unique_ptr<OpDescBind> sum_op(
-          new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}},
+      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
+                                                {{"Out", {out_name}}},
                                                AttributeMap{}));
      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
    }
  }

-  pending_sum_ops.sort(
-      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
-         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
+  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
+                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
    return a.first > b.first;
  });
  for (auto& p : pending_sum_ops) {
@@ -508,14 +505,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
  return backward_descs;
 }

-static BlockDescBind* CreateStepBlock(
-    ProgramDescBind& program_desc,
-    std::unordered_set<std::string>* no_grad_vars,
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
    std::unordered_map<std::string, std::string>* grad_to_var,
    int step_block_idx) {
  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
                                                   no_grad_vars, grad_to_var);
-  BlockDescBind* backward_block =
+  BlockDesc* backward_block =
      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
  for (auto& ptr : backward_block_op_descs) {
    backward_block->AppendAllocatedOp(move(ptr));
@@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock(
 }

 ParamGradInfoMap AppendBackward(
-    ProgramDescBind& program_desc, const VarDescBind& target,
+    ProgramDesc& program_desc, const VarDesc& target,
    const std::unordered_set<std::string>& no_grad_vars) {
  std::unordered_set<std::string> no_grad_var_names;
  no_grad_var_names.reserve(no_grad_vars.size() + 1);
@@ -541,8 +537,8 @@ ParamGradInfoMap AppendBackward(
  PADDLE_ENFORCE(is_scalar, "target should be scalar");
  VLOG(3) << "backward from loss=" << target.Name()
          << " data_type=" << target.GetDataType();
-  std::unique_ptr<OpDescBind> fill_one_op(
-      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
+  std::unique_ptr<OpDesc> fill_one_op(
+      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
                 {{"shape", std::vector<int>{1}},
                  {"value", static_cast<float>(1.0)},
                  {"dtype", target.GetDataType()}}));

--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
                                            GradVarInfo /*grad_var_info*/>;

 ParamGradInfoMap AppendBackward(
-    ProgramDescBind& program_desc, const VarDescBind& target,
+    ProgramDesc& program_desc, const VarDesc& target,
    const std::unordered_set<std::string>& no_grad_vars);

 }  // namespace framework

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker {
  using SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<OpDescBind> Apply() const override {
-    auto grad_op = new OpDescBind();
+  std::unique_ptr<OpDesc> Apply() const override {
+    auto grad_op = new OpDesc();
    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
    grad_op->SetType("rowwise_add_grad");
-    return std::unique_ptr<OpDescBind>(grad_op);
+    return std::unique_ptr<OpDesc>(grad_op);
  }
 };

@@ -159,14 +159,14 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "x");
-    AddOutput("Y", "out");
+    AddOutput("Out", "out");
    AddComment("");
  }
 };

 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
    AddOutput("Out", "the output tensor of sum operator.");
@@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase {
 public:
  using GradOpDescMakerBase::GradOpDescMakerBase;

-  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
-    std::vector<std::unique_ptr<OpDescBind>> retv;
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<OpDesc>> retv;
    auto x_g = InputGrad("X");
    if (!x_g.empty()) {
-      auto *op_desc = new OpDescBind();
+      auto *op_desc = new OpDesc();
      op_desc->SetType("scale");
      op_desc->SetInput("X", OutputGrad("Out"));
      op_desc->SetOutput("Out", x_g);
@@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase {

    auto y_g = InputGrad("Y");
    if (!y_g.empty()) {
-      auto *op_desc = new OpDescBind();
+      auto *op_desc = new OpDesc();
      op_desc->SetType("scale");
      op_desc->SetInput("X", OutputGrad("Out"));
      op_desc->SetOutput("Out", y_g);
@@ -430,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) {
  ASSERT_EQ("fill_zeros_like", fill_zero.Type());
  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
  ASSERT_EQ("Z", fill_zero.Input("X"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Y").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y"));
+  ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));

  auto &d_many_out = *net->ops_[1];
  ASSERT_EQ("many_output_op_grad", d_many_out.Type());
@@ -505,25 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
 }

 TEST(Backward, simple_single_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);

-  f::OpDescBind *op = block->AppendOp();
+  f::OpDesc *op = block->AppendOp();
  op->SetType("rowwise_add");
  op->SetInput("X", {"x"});
  op->SetInput("b", {"b"});
  op->SetOutput("Out", {"out"});

-  auto target = f::VarDescBind("out");
+  auto target = f::VarDesc("out");
  target.SetShape({1});
  auto var_to_grad =
      AppendBackward(program, target, std::unordered_set<std::string>{});

  ASSERT_EQ(block->AllOps().size(), 3UL);
-  f::OpDescBind *fill_op = block->AllOps()[1];
+  f::OpDesc *fill_op = block->AllOps()[1];
  EXPECT_EQ(fill_op->Type(), "fill_constant");

-  f::OpDescBind *grad_op = block->AllOps()[2];
+  f::OpDesc *grad_op = block->AllOps()[2];
  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
@@ -543,16 +543,16 @@ TEST(Backward, simple_single_op) {
 }

 TEST(Backward, default_attribute) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op = block->AppendOp();
  op->SetType("mul");
  op->SetInput("X", {"x"});
  op->SetInput("Y", {"y"});
  op->SetOutput("Out", {"out"});
  op->CheckAttrs();

-  auto target = f::VarDescBind("out");
+  auto target = f::VarDesc("out");
  target.SetShape({1});
  AppendBackward(program, target, std::unordered_set<std::string>{});

@@ -560,47 +560,47 @@ TEST(Backward, default_attribute) {
  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);

-  f::OpDescBind *fill_op = block->AllOps()[1];
+  f::OpDesc *fill_op = block->AllOps()[1];
  EXPECT_EQ(fill_op->Type(), "fill_constant");

-  f::OpDescBind *grad_op = block->AllOps()[2];
+  f::OpDesc *grad_op = block->AllOps()[2];
  ASSERT_EQ(grad_op->Type(), "mul_grad");
  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
 }

 TEST(Backward, simple_mult_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
  op1->SetType("rowwise_add");
  op1->SetInput("X", {"x1"});
  op1->SetInput("b", {"b1"});
  op1->SetOutput("Out", {"out1"});

-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
  op2->SetType("mul");
  op2->SetInput("X", {"out1"});
  op2->SetInput("Y", {"y2"});
  op2->SetOutput("Out", {"out2"});

-  f::OpDescBind *op3 = block->AppendOp();
+  f::OpDesc *op3 = block->AppendOp();
  op3->SetType("rowwise_add");
  op3->SetInput("X", {"out2"});
  op3->SetInput("b", {"b3"});
  op3->SetOutput("Out", {"out3"});

-  auto target = f::VarDescBind("out3");
+  auto target = f::VarDesc("out3");
  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad =
      AppendBackward(program, target, std::unordered_set<std::string>{});

  ASSERT_EQ(block->AllOps().size(), 6UL + 1);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
  EXPECT_EQ(fill_op->Type(), "fill_constant");

-  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  f::OpDesc *grad_op1 = block->AllOps()[6];
  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -611,7 +611,7 @@ TEST(Backward, simple_mult_op) {
  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
            std::vector<std::string>({f::GradVarName("b1")}));

-  f::OpDescBind *grad_op2 = block->AllOps()[5];
+  f::OpDesc *grad_op2 = block->AllOps()[5];
  EXPECT_EQ(grad_op2->Type(), "mul_grad");
  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
@@ -625,7 +625,7 @@ TEST(Backward, simple_mult_op) {
  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
            std::vector<std::string>({f::GradVarName("y2")}));

-  f::OpDescBind *grad_op3 = block->AllOps()[4];
+  f::OpDesc *grad_op3 = block->AllOps()[4];
  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
@@ -655,42 +655,42 @@ TEST(Backward, simple_mult_op) {
 }

 TEST(Backward, intermedia_var_no_grad) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
  op1->SetType("rowwise_add");
  op1->SetInput("X", {"x1"});
  op1->SetInput("b", {"b1"});
  op1->SetOutput("Out", {"out1"});

-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
  op2->SetType("mul");
  op2->SetInput("X", {"x2"});
  op2->SetInput("Y", {"y2"});
  op2->SetOutput("Out", {"out2"});

-  f::OpDescBind *op3 = block->AppendOp();
+  f::OpDesc *op3 = block->AppendOp();
  op3->SetType("rowwise_add");
  op3->SetInput("X", {"out2"});
  op3->SetInput("b", {"b3"});
  op3->SetOutput("Out", {"out3"});

-  f::OpDescBind *op4 = block->AppendOp();
+  f::OpDesc *op4 = block->AppendOp();
  op4->SetType("mul");
  op4->SetInput("X", {"out1"});
  op4->SetInput("Y", {"out3"});
  op4->SetOutput("Out", {"out4"});

-  auto target = f::VarDescBind("out4");
+  auto target = f::VarDesc("out4");
  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {"out3"});

  ASSERT_EQ(block->AllOps().size(), 7UL);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
  EXPECT_EQ(fill_op->Type(), "fill_constant");

-  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  f::OpDesc *grad_op1 = block->AllOps()[6];
  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -701,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) {
  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
            std::vector<std::string>({f::GradVarName("b1")}));

-  f::OpDescBind *grad_op4 = block->AllOps()[5];
+  f::OpDesc *grad_op4 = block->AllOps()[5];
  EXPECT_EQ(grad_op4->Type(), "mul_grad");
  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
@@ -726,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) {
 }

 TEST(Backward, var_no_grad) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
  op1->SetType("mult_in_out");
  op1->SetInput("X", {"x1"});
  op1->SetInput("H", {"h1"});
  op1->SetOutput("Y", {"y1"});
  op1->SetOutput("Z", {"z1"});

-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
  op2->SetType("mult_in_out");
  op2->SetInput("X", {"y1"});
  op2->SetInput("H", {"z1"});
  op2->SetOutput("Y", {"y2"});
  op2->SetOutput("Z", {"z2"});

-  auto target = f::VarDescBind("z2");
+  auto target = f::VarDesc("z2");
  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {"z1"});

  ASSERT_EQ(block->AllOps().size(), 6UL);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
  EXPECT_EQ(fill_op->Type(), "fill_constant");

-  f::OpDescBind *grad_op2 = block->AllOps()[3];
+  f::OpDesc *grad_op2 = block->AllOps()[3];
  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
@@ -767,15 +767,15 @@ TEST(Backward, var_no_grad) {
            std::vector<std::string>({f::GradVarName("y1")}));
  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());

-  f::OpDescBind *fill_zero_op = block->AllOps()[4];
+  f::OpDesc *fill_zero_op = block->AllOps()[4];
  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(fill_zero_op->Output("Y"),
+  EXPECT_EQ(fill_zero_op->Output("Out"),
            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));

-  f::OpDescBind *grad_op1 = block->AllOps()[5];
+  f::OpDesc *grad_op1 = block->AllOps()[5];
  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -803,37 +803,37 @@ TEST(Backward, var_no_grad) {
 }

 TEST(Backward, shared_var) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
-  f::OpDescBind *op1 = block->AppendOp();
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
  op1->SetType("rowwise_add");
  op1->SetInput("X", {"x1"});
  op1->SetInput("b", {"b1"});
  op1->SetOutput("Out", {"out1"});

-  f::OpDescBind *op2 = block->AppendOp();
+  f::OpDesc *op2 = block->AppendOp();
  op2->SetType("mul");
  op2->SetInput("X", {"out1"});
  op2->SetInput("Y", {"y2"});
  op2->SetOutput("Out", {"out2"});

-  f::OpDescBind *op3 = block->AppendOp();
+  f::OpDesc *op3 = block->AppendOp();
  op3->SetType("rowwise_add");
  op3->SetInput("X", {"out1"});
  op3->SetInput("b", {"b3"});
  op3->SetOutput("Out", {"out3"});

-  auto target = f::VarDescBind("out3");
+  auto target = f::VarDesc("out3");
  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad =
      AppendBackward(program, target, std::unordered_set<std::string>{});

  ASSERT_EQ(block->AllOps().size(), 8UL);
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
  EXPECT_EQ(fill_op->Type(), "fill_constant");

-  f::OpDescBind *grad_op3 = block->AllOps()[4];
+  f::OpDesc *grad_op3 = block->AllOps()[4];
  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
@@ -844,7 +844,7 @@ TEST(Backward, shared_var) {
  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
            std::vector<std::string>({f::GradVarName("b3")}));

-  f::OpDescBind *grad_op4 = block->AllOps()[5];
+  f::OpDesc *grad_op4 = block->AllOps()[5];
  ASSERT_EQ(grad_op4->Type(), "mul_grad");
  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
@@ -858,7 +858,7 @@ TEST(Backward, shared_var) {
  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
            std::vector<std::string>({f::GradVarName("y2")}));

-  f::OpDescBind *sum_op = block->AllOps()[6];
+  f::OpDesc *sum_op = block->AllOps()[6];
  ASSERT_EQ(sum_op->Type(), "sum");
  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
@@ -868,7 +868,7 @@ TEST(Backward, shared_var) {
  EXPECT_EQ(sum_op->Output("Out"),
            std::vector<std::string>({f::GradVarName("out1")}));

-  f::OpDescBind *grad_op1 = block->AllOps()[7];
+  f::OpDesc *grad_op1 = block->AllOps()[7];
  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
@@ -895,19 +895,19 @@ TEST(Backward, shared_var) {
 }

 TEST(Backward, half_backward) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
  auto *op1 = block->AppendOp();
  op1->SetType("minus");
  op1->SetInput("X", {"a"});
  op1->SetInput("Y", {"b"});
  op1->SetOutput("Out", {"out"});

-  auto target = f::VarDescBind("out");
+  auto target = f::VarDesc("out");
  target.SetShape({1});
  size_t forward_len = block->AllOps().size();
  auto var_to_grad = AppendBackward(program, target, {"b"});
-  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
  EXPECT_EQ(fill_op->Type(), "fill_constant");
  auto ops = block->AllOps();
  ASSERT_EQ(3UL, ops.size());

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -19,18 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-VarDescBind *BlockDescBind::Var(const std::string &name) {
+VarDesc *BlockDesc::Var(const std::string &name) {
  auto it = vars_.find(name);
  if (it != vars_.end()) {
    return it->second.get();
  }
  need_update_ = true;
-  auto *var = new VarDescBind(name);
+  auto *var = new VarDesc(name);
  vars_[name].reset(var);
  return var;
 }

-VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
+VarDesc *BlockDesc::FindVar(const std::string &name) const {
  auto it = vars_.find(name);
  if (it == vars_.end()) {
    return nullptr;
@@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
  return it->second.get();
 }

-bool BlockDescBind::HasVar(const std::string &name) const {
+bool BlockDesc::HasVar(const std::string &name) const {
  return vars_.find(name) != vars_.end();
 }

-VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
  if (name == kEmptyVarName) return nullptr;

  auto it = vars_.find(name);
@@ -53,53 +53,67 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
  return it->second.get();
 }

-VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
-    const std::string &name_bytes) {
-  VarDescBind *res = FindVarRecursive(name_bytes);
+VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
+  VarDesc *res = FindVarRecursive(name_bytes);
  if (res == nullptr) {
    res = Var(name_bytes);
  }
  return res;
 }

-bool BlockDescBind::HasVarRecursive(const std::string &name) const {
+bool BlockDesc::HasVarRecursive(const std::string &name) const {
  return FindVarRecursive(name) != nullptr;
 }

-std::vector<VarDescBind *> BlockDescBind::AllVars() const {
-  std::vector<VarDescBind *> res;
+std::vector<VarDesc *> BlockDesc::AllVars() const {
+  std::vector<VarDesc *> res;
  for (const auto &p : vars_) {
    res.push_back(p.second.get());
  }
  return res;
 }

-OpDescBind *BlockDescBind::AppendOp() {
+OpDesc *BlockDesc::AppendOp() {
  need_update_ = true;
-  ops_.emplace_back(new OpDescBind());
+  ops_.emplace_back(new OpDesc());
  return ops_.back().get();
 }

-void BlockDescBind::AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc) {
+void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
  need_update_ = true;
  ops_.emplace_back(std::move(op_desc));
 }

-OpDescBind *BlockDescBind::PrependOp() {
+OpDesc *BlockDesc::PrependOp() {
  need_update_ = true;
-  ops_.emplace_front(new OpDescBind());
+  ops_.emplace_front(new OpDesc());
  return ops_.front().get();
 }

-std::vector<OpDescBind *> BlockDescBind::AllOps() const {
-  std::vector<OpDescBind *> res;
+// Removes the ops in the half-open index range [s, e); out-of-range or
+// inverted ranges are ignored.
+void BlockDesc::RemoveOp(size_t s, size_t e) {
+  // Compare indices, not iterators: begin() + s past end() is undefined.
+  if (s >= ops_.size() || e > ops_.size() || s > e) return;
+  need_update_ = true;
+  for (auto it = ops_.begin() + s; it != ops_.begin() + e; ++it) {
+    for (const auto &n : (*it)->InputArgumentNames()) {
+      // TODO(typhoonzero): delete vars if no other op use it.
+      VLOG(3) << "deleting var " << n;
+    }
+  }
+  ops_.erase(ops_.begin() + s, ops_.begin() + e);
+}
+
+std::vector<OpDesc *> BlockDesc::AllOps() const {
+  std::vector<OpDesc *> res;
  for (const auto &op : ops_) {
    res.push_back(op.get());
  }
  return res;
 }

-void BlockDescBind::Flush() {
+void BlockDesc::Flush() {
  for (auto &op_desc : ops_) {
    op_desc->Flush();
  }
@@ -121,43 +135,43 @@ void BlockDescBind::Flush() {
  }
 }

-BlockDescBind *BlockDescBind::ParentBlock() const {
+BlockDesc *BlockDesc::ParentBlock() const {
  if (this->desc_->parent_idx() == kNoneBlockIndex) {
    return nullptr;
  }
  return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx()));
 }

-BlockDesc *BlockDescBind::Proto() {
+proto::BlockDesc *BlockDesc::Proto() {
  Flush();
  return desc_;
 }

-BlockDescBind::BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
+BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
    : prog_(prog), desc_(desc), need_update_(false) {
-  for (const VarDesc &var_desc : desc_->vars()) {
-    vars_[var_desc.name()].reset(new VarDescBind(var_desc));
+  for (const proto::VarDesc &var_desc : desc_->vars()) {
+    vars_[var_desc.name()].reset(new VarDesc(var_desc));
  }
-  for (const OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDescBind(op_desc, prog));
+  for (const proto::OpDesc &op_desc : desc_->ops()) {
+    ops_.emplace_back(new OpDesc(op_desc, prog));
  }
 }

-BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
-                             ProgramDescBind *prog)
+BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
+                     ProgramDesc *prog)
    : prog_(prog), desc_(desc) {
  need_update_ = true;
  for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDescBind(*op));
+    ops_.emplace_back(new OpDesc(*op));
  }

  for (auto &it : other.vars_) {
-    auto *var = new VarDescBind(*it.second);
+    auto *var = new VarDesc(*it.second);
    vars_[it.first].reset(var);
  }
 }

-void BlockDescBind::ClearPBOps() {
+void BlockDesc::ClearPBOps() {
  auto ops = this->desc_->mutable_ops();
  while (!ops->empty()) {
    // we do not own the OpDesc, so release the ownership.
@@ -165,7 +179,7 @@ void BlockDescBind::ClearPBOps() {
  }
 }

-void BlockDescBind::ClearPBVars() {
+void BlockDesc::ClearPBVars() {
  auto vars = this->desc_->mutable_vars();
  while (!vars->empty()) {
    // we do not own the VarDesc, so release the ownership.

--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -28,20 +28,19 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class ProgramDescBind;
+class ProgramDesc;

 // Each Protobuf Message, we provide a XXXBind class. In that class, we optimize
 // read/write speed. Only when we want the protobuf message, the local changes
 // will be synchronized (by `Sync` method).

-class BlockDescBind {
+class BlockDesc {
 public:
-  BlockDescBind(ProgramDescBind *prog, BlockDesc *desc);
+  BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc);

-  BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
-                ProgramDescBind *prog);
+  BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);

-  ~BlockDescBind() {
+  ~BlockDesc() {
    this->ClearPBVars();
    this->ClearPBOps();
  }
@@ -50,15 +49,15 @@ class BlockDescBind {

  int32_t Parent() const { return desc_->parent_idx(); }

-  VarDescBind *Var(const std::string &name_bytes);
+  VarDesc *Var(const std::string &name_bytes);

-  VarDescBind *FindVar(const std::string &name_bytes) const;
+  VarDesc *FindVar(const std::string &name_bytes) const;

  bool HasVar(const std::string &var_name) const;

-  VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+  VarDesc *FindVarRecursive(const std::string &name_bytes) const;

-  VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+  VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes);

  bool HasVarRecursive(const std::string &var_name) const;

@@ -70,41 +69,43 @@ class BlockDescBind {
    return var_names;
  }

-  std::vector<VarDescBind *> AllVars() const;
+  std::vector<VarDesc *> AllVars() const;

-  BlockDescBind *ParentBlock() const;
+  BlockDesc *ParentBlock() const;

-  OpDescBind *AppendOp();
+  OpDesc *AppendOp();

-  void AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc);
+  void AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);

-  OpDescBind *PrependOp();
+  OpDesc *PrependOp();

-  std::vector<OpDescBind *> AllOps() const;
+  void RemoveOp(size_t s, size_t e);
+
+  std::vector<OpDesc *> AllOps() const;

  size_t OpSize() const { return ops_.size(); }

-  OpDescBind *Op(int idx) { return ops_.at(idx).get(); }
+  OpDesc *Op(int idx) { return ops_.at(idx).get(); }

  void Flush();

-  BlockDesc *Proto();
+  proto::BlockDesc *Proto();

-  ProgramDescBind *Program() { return this->prog_; }
+  ProgramDesc *Program() { return this->prog_; }

 private:
  void ClearPBOps();
  void ClearPBVars();

 private:
-  ProgramDescBind *prog_;  // not_own
-  BlockDesc *desc_;        // not_own
+  ProgramDesc *prog_;       // not_own
+  proto::BlockDesc *desc_;  // not_own
  bool need_update_;

-  std::deque<std::unique_ptr<OpDescBind>> ops_;
-  std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+  std::deque<std::unique_ptr<OpDesc>> ops_;
+  std::unordered_map<std::string, std::unique_ptr<VarDesc>> vars_;

-  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
+  DISABLE_COPY_AND_ASSIGN(BlockDesc);
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/data_layout.h
+++ b/paddle/framework/data_layout.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+// Storage orders that StringToDataLayout can name, plus a wildcard value.
+enum DataLayout {
+  kNHWC = 0,
+  kNCHW = 1,
+  kAnyLayout = 2,
+};
+
+// Converts a storage-order name to its DataLayout value.
+// Accepts exactly "NHWC", "nhwc", "NCHW" and "nchw"; any other string
+// (including mixed case) raises through PADDLE_THROW.
+inline DataLayout StringToDataLayout(const std::string& str) {
+  if (str == "NHWC" || str == "nhwc") {
+    return DataLayout::kNHWC;
+  } else if (str == "NCHW" || str == "nchw") {
+    return DataLayout::kNCHW;
+  } else {
+    PADDLE_THROW("Unknown storage order string: %s", str);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -20,7 +20,8 @@
 namespace paddle {
 namespace framework {

-inline DataType ToDataType(std::type_index type) {
+inline proto::DataType ToDataType(std::type_index type) {
+  using namespace paddle::framework::proto;
  if (typeid(float).hash_code() == type.hash_code()) {
    return DataType::FP32;
  } else if (typeid(double).hash_code() == type.hash_code()) {
@@ -36,7 +37,8 @@ inline DataType ToDataType(std::type_index type) {
  }
 }

-inline std::type_index ToTypeIndex(DataType type) {
+inline std::type_index ToTypeIndex(proto::DataType type) {
+  using namespace paddle::framework::proto;
  switch (type) {
    case DataType::FP32:
      return typeid(float);
@@ -54,7 +56,8 @@ inline std::type_index ToTypeIndex(DataType type) {
 }

 template <typename Visitor>
-inline void VisitDataType(DataType type, Visitor visitor) {
+inline void VisitDataType(proto::DataType type, Visitor visitor) {
+  using namespace paddle::framework::proto;
  switch (type) {
    case DataType::FP32:
      visitor.template operator()<float>();

--- a/paddle/framework/details/op_registry.h
+++ b/paddle/framework/details/op_registry.h
@@ -90,7 +90,7 @@ struct OpInfoFiller<T, kOperator> {
 template <typename T>
 struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
  void operator()(const char* op_type, OpInfo* info) const {
-    info->proto_ = new OpProto;
+    info->proto_ = new proto::OpProto;
    info->checker_ = new OpAttrChecker();
    auto maker = T(info->proto_, info->checker_);
    maker.Validate();
@@ -106,10 +106,10 @@ template <typename T>
 struct OpInfoFiller<T, kGradOpDescMaker> {
  void operator()(const char* op_type, OpInfo* info) const {
    info->grad_op_maker_ = [](
-        const OpDescBind& fwd_op,
+        const OpDesc& fwd_op,
        const std::unordered_set<std::string>& no_grad_set,
        std::unordered_map<std::string, std::string>* grad_to_var,
-        const std::vector<BlockDescBind*>& grad_block) {
+        const std::vector<BlockDesc*>& grad_block) {
      T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
      return maker();
    };
@@ -119,7 +119,7 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
 template <typename T>
 struct OpInfoFiller<T, kVarTypeInference> {
  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) {
+    info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
      T inference;
      inference(fwd_op, block);
    };

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -41,20 +41,20 @@ Executor::Executor(const std::vector<platform::Place>& places) {
  device_contexts_.swap(borrowed_contexts);
 }

-static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
-  if (var_type == VarDesc::LOD_TENSOR) {
+static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
+  if (var_type == proto::VarDesc::LOD_TENSOR) {
    var->GetMutable<LoDTensor>();
-  } else if (var_type == VarDesc::SELECTED_ROWS) {
+  } else if (var_type == proto::VarDesc::SELECTED_ROWS) {
    var->GetMutable<SelectedRows>();
-  } else if (var_type == VarDesc::FEED_MINIBATCH) {
+  } else if (var_type == proto::VarDesc::FEED_MINIBATCH) {
    var->GetMutable<FeedFetchList>();
-  } else if (var_type == VarDesc::FETCH_LIST) {
+  } else if (var_type == proto::VarDesc::FETCH_LIST) {
    var->GetMutable<FeedFetchList>();
-  } else if (var_type == VarDesc::STEP_SCOPES) {
+  } else if (var_type == proto::VarDesc::STEP_SCOPES) {
    var->GetMutable<std::vector<framework::Scope>>();
-  } else if (var_type == VarDesc::LOD_RANK_TABLE) {
+  } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) {
    var->GetMutable<LoDRankTable>();
-  } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) {
+  } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
    var->GetMutable<LoDTensorArray>();
  } else {
    PADDLE_THROW(
@@ -64,8 +64,8 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) {
  }
 }

-void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
-                   bool create_local_scope) {
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
+                   bool create_local_scope, bool create_vars) {
  // TODO(tonyyang-svail):
  //    - only runs on the first device (i.e. no interdevice communication)
  //    - will change to use multiple blocks for RNN op and Cond Op
@@ -74,6 +74,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  auto& device = device_contexts_[0];

  Scope* local_scope = scope;
+  if (create_vars) {
    if (create_local_scope) {
      local_scope = &scope->NewScope();
      for (auto& var : block.AllVars()) {
@@ -100,7 +101,8 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
                << ptr;
      }
-  }
+    }  // if (create_local_scope)
+  }    // if (create_vars)

  for (auto& op_desc : block.AllOps()) {
    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);

--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -40,6 +40,16 @@ class DeviceContextPool {
    return *pool;
  }

+  // Looks up the device context registered for `place`; throws if the pool
+  // has no entry for it.
+  const platform::DeviceContext* Borrow(const platform::Place& place) {
+    auto found = device_contexts_.equal_range(place);
+    if (found.first != found.second) return found.first->second;
+    PADDLE_THROW(
+        "'Place' is not supported, Please re-compile with WITH_GPU "
+        "option");
+  }
+
  std::vector<const platform::DeviceContext*> Borrow(
      const std::vector<platform::Place>& places) {
    PADDLE_ENFORCE_GT(places.size(), 0);
@@ -114,7 +124,8 @@ class Executor {
   *  ProgramDesc
   *  Scope
   */
-  void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true);
+  void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
+           bool create_vars = true);

 private:
  std::vector<const platform::DeviceContext*> device_contexts_;

--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -14,7 +14,7 @@ limitations under the License. */

 syntax = "proto2";
 option optimize_for = LITE_RUNTIME;
-package paddle.framework;
+package paddle.framework.proto;

 enum AttrType {
  INT = 0;

--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -22,21 +22,27 @@
 namespace paddle {
 namespace framework {

+/*
+  This functor class is responsible for creating the gradient ops for the given
+  operator fwd_op. After it is called (through operator()), the pairs of
+  (gradient variable, corresponding input variable of fwd_op) will be added to
+  grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
+  gradient variable will be ignored or kEmptyVarName depending on the template
+  argument DropEmptyIG in the derived classes.
+ */
 class GradOpDescMakerBase {
 public:
  explicit GradOpDescMakerBase(
-      const OpDescBind& fwd_op,
-      const std::unordered_set<std::string>& no_grad_set,
+      const OpDesc& fwd_op, const std::unordered_set<std::string>& no_grad_set,
      std::unordered_map<std::string, std::string>* grad_to_var,
-      const std::vector<BlockDescBind*>& grad_block =
-          std::vector<BlockDescBind*>())
+      const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>())
      : fwd_op_(fwd_op),
        no_grad_set_(no_grad_set),
        grad_to_var_(grad_to_var),
        grad_block_(grad_block) {}

  virtual ~GradOpDescMakerBase() = default;
-  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+  virtual std::vector<std::unique_ptr<OpDesc>> operator()() const = 0;

 protected:
  std::vector<std::string> InputGrad(const std::string& name,
@@ -58,6 +64,16 @@ class GradOpDescMakerBase {
    if (!drop_empty_grad) {
      return ret_val;
    }
+    PADDLE_ENFORCE_LE(var_names.size(), 1UL,
+                      "BUG from operator developer:"
+                      " for input argument with a list of variables, "
+                      " drop_empty_grad is not allowed because it makes"
+                      " the correspondence bewteen a variable and its gradient"
+                      " ambiguous. Use REGISTER_OP_EX to register the op"
+                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " Op type %s",
+                      fwd_op_.Type());
+
    std::vector<std::string> dropped_ret_val;
    dropped_ret_val.reserve(ret_val.size());
    std::copy_if(ret_val.begin(), ret_val.end(),
@@ -105,26 +121,26 @@ class GradOpDescMakerBase {
  std::string ForwardOpType() const { return this->fwd_op_.Type(); }

 private:
-  const OpDescBind& fwd_op_;
+  const OpDesc& fwd_op_;
  const std::unordered_set<std::string>& no_grad_set_;
  std::unordered_map<std::string, std::string>* grad_to_var_;

 protected:
-  std::vector<BlockDescBind*> grad_block_;
+  std::vector<BlockDesc*> grad_block_;
 };

 class SingleGradOpDescMaker : public GradOpDescMakerBase {
 public:
  using GradOpDescMakerBase::GradOpDescMakerBase;

-  std::vector<std::unique_ptr<OpDescBind>> operator()() const {
-    std::vector<std::unique_ptr<OpDescBind>> retv;
+  std::vector<std::unique_ptr<OpDesc>> operator()() const {
+    std::vector<std::unique_ptr<OpDesc>> retv;
    retv.emplace_back(this->Apply());
    return retv;
  }

 protected:
-  virtual std::unique_ptr<OpDescBind> Apply() const = 0;
+  virtual std::unique_ptr<OpDesc> Apply() const = 0;
 };

 template <bool DropEmptyIG = true>
@@ -133,8 +149,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
  using SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  virtual std::unique_ptr<OpDescBind> Apply() const {
-    auto* grad = new OpDescBind();
+  virtual std::unique_ptr<OpDesc> Apply() const {
+    auto* grad = new OpDesc();
    grad->SetType(this->GradOpType());

    for (auto& input_param : this->InputNames()) {
@@ -150,7 +166,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {

    grad->SetAttrMap(this->Attrs());

-    return std::unique_ptr<OpDescBind>(grad);
+    return std::unique_ptr<OpDesc>(grad);
  }

  virtual std::string GradOpType() const {
@@ -161,7 +177,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
 class EmptyGradOpMaker : public GradOpDescMakerBase {
 public:
  using GradOpDescMakerBase::GradOpDescMakerBase;
-  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
    return {};
  }
 };

--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// For more details about the design of LibraryType, Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
+
+enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 };
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -46,4 +46,13 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
 }

 }  // namespace framework
+
+// Prints the sequence count, then one "Seq #index, Len=length" line per item.
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table) {
+  const auto& entries = table.items();
+  out << "NumOfSequence " << entries.size() << "\n";
+  for (const auto& e : entries)
+    out << "\tSeq #" << e.index << ", Len=" << e.length << "\n";
+  return out;
+}
 }  // namespace paddle
--- a/paddle/framework/lod_rank_table.h
+++ b/paddle/framework/lod_rank_table.h
@@ -13,6 +13,7 @@
   limitations under the License. */

 #pragma once
+#include <iosfwd>
 #include "paddle/framework/lod_tensor.h"

 namespace paddle {
@@ -52,4 +53,8 @@ class LoDRankTable {
 };

 }  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table);
+
 }  // namespace paddle
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -197,7 +197,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
  {  // the 2nd field, tensor description
     // int32_t  size
     // void*    protobuf message
-    framework::TensorDesc desc;
+    proto::TensorDesc desc;
    desc.set_data_type(framework::ToDataType(tensor.type()));
    auto dims = framework::vectorize(tensor.dims());
    auto *pb_dims = desc.mutable_dims();
@@ -262,7 +262,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
  uint32_t version;
  is.read(reinterpret_cast<char *>(&version), sizeof(version));
  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
-  framework::TensorDesc desc;
+  proto::TensorDesc desc;
  {  // int32_t size
     // proto buffer
    int32_t size;
@@ -281,16 +281,16 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
    void *buf;
    platform::Place cpu = platform::CPUPlace();
    switch (desc.data_type()) {
-      case framework::FP32:
+      case proto::FP32:
        buf = tensor->mutable_data<float>(cpu);
        break;
-      case framework::FP64:
+      case proto::FP64:
        buf = tensor->mutable_data<double>(cpu);
        break;
-      case framework::INT32:
+      case proto::INT32:
        buf = tensor->mutable_data<int>(cpu);
        break;
-      case framework::INT64:
+      case proto::INT64:
        buf = tensor->mutable_data<int64_t>(cpu);
        break;
      default:

--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -184,6 +184,18 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
  return tensor;
 }

+// Get the absolute offset of lod[start_level][start_idx:end_idx] and the
+// relative lengths of its details for every level (i.e., [start_level: ]).
+//
+// For example,
+//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
+//   start_level = 0
+//   start_idx = 1
+//   end_idx = 3
+//
+// Returns:
+//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
+//  pair<size_t, size_t> = {11, 24}
 std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);


--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -25,12 +25,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class OpDescBind;
-class BlockDescBind;
+class OpDesc;
+class BlockDesc;
 class CompileTimeInferShapeContext : public InferShapeContext {
 public:
-  CompileTimeInferShapeContext(const OpDescBind &op,
-                               const BlockDescBind &block);
+  CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block);

  bool HasInput(const std::string &name) const override;

@@ -58,11 +57,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
    PADDLE_ENFORCE_LT(j, Outputs(out).size());
    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
-    if (in_var->GetType() != VarDesc::LOD_TENSOR) {
+    if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) {
      VLOG(3) << "input " << in << " is not LodTensor";
      return;
    }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
+    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR,
                      "The %d-th output of Output(%s) must be LoDTensor.", j,
                      out);
    out_var->SetLoDLevel(in_var->GetLodLevel());
@@ -70,19 +69,18 @@ class CompileTimeInferShapeContext : public InferShapeContext {
  bool IsRuntime() const override;

 protected:
-  VarDesc::VarType GetVarType(const std::string &name) const override;
+  proto::VarDesc::VarType GetVarType(const std::string &name) const override;

  DDim GetDim(const std::string &name) const override;

  void SetDim(const std::string &name, const DDim &dim) override;

-  const OpDescBind &op_;
-  const BlockDescBind &block_;
+  const OpDesc &op_;
+  const BlockDesc &block_;
 };

-OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
-                       const VariableNameMap &outputs,
-                       const AttributeMap &attrs) {
+OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
+               const VariableNameMap &outputs, const AttributeMap &attrs) {
  desc_.set_type(type);
  inputs_ = inputs;
  outputs_ = outputs;
@@ -90,12 +88,12 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
  need_update_ = true;
 }

-OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
+OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
    : desc_(desc), need_update_(false) {
  // restore inputs_
  int input_size = desc_.inputs_size();
  for (int i = 0; i < input_size; ++i) {
-    const OpDesc::Var &var = desc_.inputs(i);
+    const proto::OpDesc::Var &var = desc_.inputs(i);
    std::vector<std::string> &args = inputs_[var.parameter()];
    int argu_size = var.arguments_size();
    args.reserve(argu_size);
@@ -106,7 +104,7 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
  // restore outputs_
  int output_size = desc_.outputs_size();
  for (int i = 0; i < output_size; ++i) {
-    const OpDesc::Var &var = desc_.outputs(i);
+    const proto::OpDesc::Var &var = desc_.outputs(i);
    std::vector<std::string> &args = outputs_[var.parameter()];
    int argu_size = var.arguments_size();
    args.reserve(argu_size);
@@ -115,9 +113,9 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
    }
  }
  // restore attrs_
-  for (const OpDesc::Attr &attr : desc_.attrs()) {
+  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
    std::string attr_name = attr.name();
-    if (attr.type() != AttrType::BLOCK) {
+    if (attr.type() != proto::AttrType::BLOCK) {
      attrs_[attr_name] = GetAttrValue(attr);
    } else {
      auto bid = attr.block_idx();
@@ -126,20 +124,19 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog)
  }
 }

-OpDesc *OpDescBind::Proto() {
+proto::OpDesc *OpDesc::Proto() {
  Flush();
  return &desc_;
 }

-const std::vector<std::string> &OpDescBind::Input(
-    const std::string &name) const {
+const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
  auto it = inputs_.find(name);
  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
                 Type());
  return it->second;
 }

-std::vector<std::string> OpDescBind::InputArgumentNames() const {
+std::vector<std::string> OpDesc::InputArgumentNames() const {
  std::vector<std::string> retv;
  for (auto &ipt : this->inputs_) {
    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
@@ -147,21 +144,20 @@ std::vector<std::string> OpDescBind::InputArgumentNames() const {
  return retv;
 }

-void OpDescBind::SetInput(const std::string &param_name,
+void OpDesc::SetInput(const std::string &param_name,
                      const std::vector<std::string> &args) {
  need_update_ = true;
  inputs_[param_name] = args;
 }

-const std::vector<std::string> &OpDescBind::Output(
-    const std::string &name) const {
+const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
  auto it = outputs_.find(name);
  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
                 name, Type());
  return it->second;
 }

-std::vector<std::string> OpDescBind::OutputArgumentNames() const {
+std::vector<std::string> OpDesc::OutputArgumentNames() const {
  std::vector<std::string> retv;
  for (auto &ipt : this->outputs_) {
    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
@@ -169,19 +165,19 @@ std::vector<std::string> OpDescBind::OutputArgumentNames() const {
  return retv;
 }

-void OpDescBind::SetOutput(const std::string &param_name,
+void OpDesc::SetOutput(const std::string &param_name,
                       const std::vector<std::string> &args) {
  need_update_ = true;
  this->outputs_[param_name] = args;
 }

-AttrType OpDescBind::GetAttrType(const std::string &name) const {
+proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return static_cast<AttrType>(it->second.which() - 1);
+  return static_cast<proto::AttrType>(it->second.which() - 1);
 }

-std::vector<std::string> OpDescBind::AttrNames() const {
+std::vector<std::string> OpDesc::AttrNames() const {
  std::vector<std::string> retv;
  retv.reserve(attrs_.size());
  for (auto &attr : attrs_) {
@@ -190,41 +186,39 @@ std::vector<std::string> OpDescBind::AttrNames() const {
  return retv;
 }

-void OpDescBind::SetAttr(const std::string &name, const Attribute &v) {
+void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
  this->attrs_[name] = v;
  need_update_ = true;
 }

-void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
+void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) {
  this->attrs_[name] = &block;
  need_update_ = true;
 }

-void OpDescBind::SetAttrMap(
+void OpDesc::SetAttrMap(
    const std::unordered_map<std::string, Attribute> &attr_map) {
  attrs_ = attr_map;
  need_update_ = true;
 }

-Attribute OpDescBind::GetAttr(const std::string &name) const {
+Attribute OpDesc::GetAttr(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
  return it->second;
 }

-int OpDescBind::GetBlockAttr(const std::string &name) const {
+int OpDesc::GetBlockAttr(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
-  return boost::get<BlockDescBind *>(it->second)->ID();
+  return boost::get<BlockDesc *>(it->second)->ID();
 }

-const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
-    const {
+const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
  return attrs_;
 }

-void OpDescBind::Rename(const std::string &old_name,
-                        const std::string &new_name) {
+void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
  for (auto &input : inputs_) {
    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
  }
@@ -235,7 +229,7 @@ void OpDescBind::Rename(const std::string &old_name,
  need_update_ = true;
 }

-void OpDescBind::RenameOutput(const std::string &old_name,
+void OpDesc::RenameOutput(const std::string &old_name,
                          const std::string &new_name) {
  for (auto &output : outputs_) {
    std::replace(output.second.begin(), output.second.end(), old_name,
@@ -244,7 +238,7 @@ void OpDescBind::RenameOutput(const std::string &old_name,
  need_update_ = true;
 }

-void OpDescBind::RenameInput(const std::string &old_name,
+void OpDesc::RenameInput(const std::string &old_name,
                         const std::string &new_name) {
  for (auto &input : inputs_) {
    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
@@ -253,8 +247,8 @@ void OpDescBind::RenameInput(const std::string &old_name,
 }

 struct SetAttrDescVisitor : public boost::static_visitor<void> {
-  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-  mutable OpDesc::Attr *attr_;
+  explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {}
+  mutable proto::OpDesc::Attr *attr_;
  void operator()(int v) const { attr_->set_i(v); }
  void operator()(float v) const { attr_->set_f(v); }
  void operator()(const std::string &v) const { attr_->set_s(v); }
@@ -272,11 +266,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
  void operator()(const std::vector<bool> &v) const {
    VectorToRepeated(v, attr_->mutable_bools());
  }
-  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(proto::BlockDesc *desc) const {
+    attr_->set_block_idx(desc->idx());
+  }
  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
 };

-void OpDescBind::Flush() {
+void OpDesc::Flush() {
  if (need_update_) {
    this->desc_.mutable_inputs()->Clear();
    for (auto &ipt : inputs_) {
@@ -297,7 +293,7 @@ void OpDescBind::Flush() {
      auto *attr_desc = desc_.add_attrs();
      attr_desc->set_name(attr.first);
      attr_desc->set_type(
-          static_cast<framework::AttrType>(attr.second.which() - 1));
+          static_cast<proto::AttrType>(attr.second.which() - 1));
      SetAttrDescVisitor visitor(attr_desc);
      boost::apply_visitor(visitor, attr.second);
    }
@@ -328,7 +324,7 @@ static void InitInferShapeFuncs() {
  });
 }

-void OpDescBind::CheckAttrs() {
+void OpDesc::CheckAttrs() {
  PADDLE_ENFORCE(!Type().empty(),
                 "CheckAttr() can not be called before type is setted.");
  auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
@@ -340,7 +336,7 @@ void OpDescBind::CheckAttrs() {
  checker->Check(attrs_);
 }

-void OpDescBind::InferShape(const BlockDescBind &block) const {
+void OpDesc::InferShape(const BlockDesc &block) const {
  VLOG(3) << "CompileTime infer shape on " << Type();
  InitInferShapeFuncs();
  auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
@@ -363,7 +359,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const {
  infer_shape(&ctx);
 }

-void OpDescBind::InferVarType(BlockDescBind *block) const {
+void OpDesc::InferVarType(BlockDesc *block) const {
  auto &info = OpInfoMap::Instance().Get(this->Type());
  if (info.infer_var_type_) {
    info.infer_var_type_(*this, block);
@@ -375,14 +371,14 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
    for (auto &out_pair : this->outputs_) {
      for (auto &out_var_name : out_pair.second) {
        block->FindRecursiveOrCreateVar(out_var_name)
-            ->SetType(VarDesc::LOD_TENSOR);
+            ->SetType(proto::VarDesc::LOD_TENSOR);
      }
    }
  }
 }

 CompileTimeInferShapeContext::CompileTimeInferShapeContext(
-    const OpDescBind &op, const BlockDescBind &block)
+    const OpDesc &op, const BlockDesc &block)
    : op_(op), block_(block) {}

 bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
@@ -484,7 +480,7 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 }
 bool CompileTimeInferShapeContext::IsRuntime() const { return false; }

-VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
    const std::string &name) const {
  return block_.FindVarRecursive(name)->GetType();
 }

--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -23,19 +23,19 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class BlockDescBind;
-class ProgramDescBind;
+class BlockDesc;
+class ProgramDesc;

-class OpDescBind {
+class OpDesc {
 public:
-  OpDescBind() {}
+  OpDesc() {}

-  OpDescBind(const std::string &type, const VariableNameMap &inputs,
+  OpDesc(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs);

-  OpDescBind(const OpDesc &desc, ProgramDescBind *prog);
+  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);

-  OpDesc *Proto();
+  proto::OpDesc *Proto();

  std::string Type() const { return desc_.type(); }

@@ -59,13 +59,13 @@ class OpDescBind {
    return attrs_.find(name) != attrs_.end();
  }

-  AttrType GetAttrType(const std::string &name) const;
+  proto::AttrType GetAttrType(const std::string &name) const;

  std::vector<std::string> AttrNames() const;

  void SetAttr(const std::string &name, const Attribute &v);

-  void SetBlockAttr(const std::string &name, BlockDescBind &block);
+  void SetBlockAttr(const std::string &name, BlockDesc &block);

  Attribute GetAttr(const std::string &name) const;

@@ -107,9 +107,9 @@ class OpDescBind {

  void CheckAttrs();

-  void InferShape(const BlockDescBind &block) const;
+  void InferShape(const BlockDesc &block) const;

-  void InferVarType(BlockDescBind *block) const;
+  void InferVarType(BlockDesc *block) const;

  void MarkAsTarget() { desc_.set_is_target(true); }

@@ -126,8 +126,10 @@ class OpDescBind {
    return ret_val;
  }

-  OpDesc desc_;
+  proto::OpDesc desc_;
+  // input arg name => input variable names
  VariableNameMap inputs_;
+  // output arg name => output variable names
  VariableNameMap outputs_;
  AttributeMap attrs_;


--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -34,7 +34,7 @@ class InferShapeBase {
 struct OpInfo {
  OpCreator creator_;
  GradOpMakerFN grad_op_maker_;
-  OpProto* proto_{nullptr};
+  proto::OpProto* proto_{nullptr};
  OpAttrChecker* checker_{nullptr};
  InferVarTypeFN infer_var_type_;
  InferShapeFN infer_shape_;
@@ -43,7 +43,7 @@ struct OpInfo {
    return proto_ != nullptr && checker_ != nullptr;
  }

-  const OpProto& Proto() const {
+  const proto::OpProto& Proto() const {
    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
    PADDLE_ENFORCE(proto_->IsInitialized(),
                   "Operator Proto must be initialized in op info");

--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -22,6 +22,8 @@ namespace framework {
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
 public:
+  using OpProto = proto::OpProto;
+  using OpAttrChecker = framework::OpAttrChecker;
  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : proto_(proto), op_checker_(op_checker) {}

@@ -80,7 +82,7 @@ class OpProtoAndCheckerMaker {

 class NOPMaker : public OpProtoAndCheckerMaker {
 public:
-  NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {}
 };


--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -18,7 +18,7 @@ limitations under the License. */

 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 public:
-  TestAttrProtoMaker(paddle::framework::OpProto* proto,
+  TestAttrProtoMaker(paddle::framework::proto::OpProto* proto,
                     paddle::framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<float>("scale", "scale of test op");
@@ -27,7 +27,7 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, DuplicatedAttr) {
-  paddle::framework::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto;
  paddle::framework::OpAttrChecker op_checker;
  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
@@ -35,7 +35,7 @@ TEST(ProtoMaker, DuplicatedAttr) {

 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 public:
-  TestInOutProtoMaker(paddle::framework::OpProto* proto,
+  TestInOutProtoMaker(paddle::framework::proto::OpProto* proto,
                      paddle::framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
@@ -44,7 +44,7 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 };

 TEST(ProtoMaker, DuplicatedInOut) {
-  paddle::framework::OpProto op_proto;
+  paddle::framework::proto::OpProto op_proto;
  paddle::framework::OpAttrChecker op_checker;
  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);

--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -31,7 +31,8 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
 }

 static VariableNameMap ConvertOpDescVarsToVarNameMap(
-    const google::protobuf::RepeatedPtrField<OpDesc::Var>& op_desc_vars) {
+    const google::protobuf::RepeatedPtrField<proto::OpDesc::Var>&
+        op_desc_vars) {
  VariableNameMap ret_val;
  for (auto& var : op_desc_vars) {
    auto& var_names = ret_val[var.parameter()];
@@ -43,9 +44,10 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
  return ret_val;
 }

-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const proto::OpDesc& op_desc) {
  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
-             "used in unit tests. Use CreateOp(const OpDescBind& op_desc) "
+             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
             "instead.";
  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
@@ -57,7 +59,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
  return CreateOp(op_desc.type(), inputs, outputs, attrs);
 }

-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
                  op_desc.GetAttrMap());
 }

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -77,9 +77,9 @@ class OpRegistry {
                                                const VariableNameMap& outputs,
                                                AttributeMap attrs);

-  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
+  static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);

-  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
 };

 template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
@@ -126,6 +126,14 @@ class OpKernelRegistrar : public Registrar {
                             __test_global_namespace_##uniq_name##__>::value, \
                msg)

+/*
+  The variadic arguments should be class types derived from one of the
+  following classes:
+    OpProtoAndCheckerMaker
+    GradOpDescMakerBase
+    VarTypeInference
+    InferShapeBase
+*/
 #define REGISTER_OPERATOR(op_type, op_class, ...)                      \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
      __reg_op__##op_type,                                             \
@@ -144,15 +152,24 @@ class OpKernelRegistrar : public Registrar {
  }

 /**
- * Macro to register Operator.
+ * Macro to register Operator. When the input is duplicable, you should
+ * use REGISTER_OP_EX with drop_empty_grad=false instead.
 */
 #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
                    grad_op_class)                                   \
+  REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,    \
+                 grad_op_class, true)
+
+// When an argument is duplicable, we need to use this version.
+// Perhaps we can omit DropEmptyIG template parameter and
+// only have one version of REGISTER_OP.
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,       \
+                       grad_op_class, drop_empty_grad)                        \
  REGISTER_OPERATOR(grad_op_type, grad_op_class);                             \
  class _GradOpDescMaker_##grad_op_type##_                                    \
-      : public ::paddle::framework::DefaultGradOpDescMaker<true> {         \
+      : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
    using ::paddle::framework::DefaultGradOpDescMaker<                        \
-        true>::DefaultGradOpDescMaker;                                     \
+        drop_empty_grad>::DefaultGradOpDescMaker;                             \
                                                                              \
   protected:                                                                 \
    virtual std::string GradOpType() const { return #grad_op_type; }          \

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -51,7 +51,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 static void BuildVar(const std::string& param_name,
                     std::initializer_list<const char*> arguments,
-                     paddle::framework::OpDesc::Var* var) {
+                     paddle::framework::proto::OpDesc::Var* var) {
  var->set_parameter(param_name);
  for (auto& arg_name : arguments) {
    var->add_arguments(arg_name);
@@ -63,7 +63,7 @@ REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
                             paddle::framework::MyTestOpProtoAndCheckerMaker);

 TEST(OpRegistry, CreateOp) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  BuildVar("input", {"aa"}, op_desc.add_inputs());
  BuildVar("output", {"bb"}, op_desc.add_outputs());
@@ -71,7 +71,7 @@ TEST(OpRegistry, CreateOp) {
  float scale = 3.3;
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(scale);

  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
@@ -83,14 +83,14 @@ TEST(OpRegistry, CreateOp) {
 }

 TEST(OpRegistry, IllegalAttr) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  BuildVar("input", {"aa"}, op_desc.add_inputs());
  BuildVar("output", {"bb"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(-2.0);

  bool caught = false;
@@ -108,7 +108,7 @@ TEST(OpRegistry, IllegalAttr) {
 }

 TEST(OpRegistry, DefaultValue) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("cos_sim");
  BuildVar("input", {"aa"}, op_desc.add_inputs());
  BuildVar("output", {"bb"}, op_desc.add_outputs());
@@ -123,7 +123,7 @@ TEST(OpRegistry, DefaultValue) {
 }

 TEST(OpRegistry, CustomChecker) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("my_test_op");
  BuildVar("input", {"ii"}, op_desc.add_inputs());
  BuildVar("output", {"oo"}, op_desc.add_outputs());
@@ -145,7 +145,7 @@ TEST(OpRegistry, CustomChecker) {
  // set 'test_attr' set to an illegal value
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("test_attr");
-  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
  attr->set_i(3);
  caught = false;
  try {
@@ -164,7 +164,7 @@ TEST(OpRegistry, CustomChecker) {
  op_desc.mutable_attrs()->Clear();
  attr = op_desc.mutable_attrs()->Add();
  attr->set_name("test_attr");
-  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
  attr->set_i(4);
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::platform::CPUDeviceContext dev_ctx;

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -377,7 +377,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
    }
  }

-  VarDesc::VarType GetVarType(const std::string& name) const override {
+  proto::VarDesc::VarType GetVarType(const std::string& name) const override {
    auto* var = scope_.FindVar(name);
    return ToVarType(var->Type());
  }
@@ -417,7 +417,7 @@ OpKernelType OperatorWithKernel::GetKernelType(
    const ExecutionContext& ctx) const {
  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
 }
-DataType OperatorWithKernel::IndicateDataType(
+proto::DataType OperatorWithKernel::IndicateDataType(
    const ExecutionContext& ctx) const {
  auto& scope = ctx.scope();
  int data_type = -1;
@@ -443,7 +443,7 @@ DataType OperatorWithKernel::IndicateDataType(
    }
  }
  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
-  return static_cast<DataType>(data_type);
+  return static_cast<proto::DataType>(data_type);
 }

 }  // namespace framework

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -358,12 +358,13 @@ struct OpKernelType {
  };

  platform::Place place_;
-  DataType data_type_;
+  proto::DataType data_type_;

-  OpKernelType(DataType data_type, platform::Place place)
+  OpKernelType(proto::DataType data_type, platform::Place place)
      : place_(place), data_type_(data_type) {}

-  OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
+  OpKernelType(proto::DataType data_type,
+               const platform::DeviceContext& dev_ctx)
      : place_(dev_ctx.GetPlace()), data_type_(data_type) {}

  bool operator==(const OpKernelType& o) const {
@@ -409,7 +410,7 @@ class OperatorWithKernel : public OperatorBase {
 private:
  // indicate kernel DataType by input data. Defaultly all input data must be
  // same.
-  DataType IndicateDataType(const ExecutionContext& ctx) const;
+  proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
 };

 std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -58,7 +58,7 @@ class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 static void BuildVar(const std::string& param_name,
                     std::initializer_list<const char*> arguments,
-                     paddle::framework::OpDesc::Var* var) {
+                     paddle::framework::proto::OpDesc::Var* var) {
  var->set_parameter(param_name);
  for (auto& arg_name : arguments) {
    *var->mutable_arguments()->Add() = arg_name;
@@ -70,14 +70,14 @@ REGISTER_OP_WITHOUT_GRADIENT(
    paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);

 TEST(OperatorBase, all) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("test_operator");
  BuildVar("input", {"IN1"}, op_desc.add_inputs());
  BuildVar("output", {"OUT1"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext device_context;
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}
  OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-    return OpKernelType(DataType::FP32, ctx.GetPlace());
+    return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
  }
 };

@@ -195,14 +195,14 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,

 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::OpDesc op_desc;
+  paddle::framework::proto::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
  BuildVar("x", {"IN1"}, op_desc.add_inputs());
  BuildVar("y", {"OUT1"}, op_desc.add_outputs());

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext cpu_device_context;
@@ -224,7 +224,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 TEST(OpKernel, multi_inputs) {
  using namespace paddle::framework;

-  OpDesc op_desc;
+  proto::OpDesc op_desc;
  op_desc.set_type("op_multi_inputs_with_kernel");
  BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
  BuildVar("k", {"k0"}, op_desc.add_inputs());
@@ -232,7 +232,7 @@ TEST(OpKernel, multi_inputs) {

  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
-  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext cpu_device_context;

--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -18,49 +18,49 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
+BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
  auto *b = desc_.add_blocks();
  b->set_parent_idx(parent.ID());
  b->set_idx(desc_.blocks_size() - 1);
-  blocks_.emplace_back(new BlockDescBind(this, b));
+  blocks_.emplace_back(new BlockDesc(this, b));
  return blocks_.back().get();
 }

-ProgramDesc *ProgramDescBind::Proto() {
+proto::ProgramDesc *ProgramDesc::Proto() {
  for (auto &block : blocks_) {
    block->Flush();
  }
  return &desc_;
 }

-ProgramDescBind::ProgramDescBind() {
+ProgramDesc::ProgramDesc() {
  auto *block = desc_.mutable_blocks()->Add();
  block->set_idx(kRootBlockIndex);
  block->set_parent_idx(kNoneBlockIndex);
-  blocks_.emplace_back(new BlockDescBind(this, block));
+  blocks_.emplace_back(new BlockDesc(this, block));
 }

-ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) {
+ProgramDesc::ProgramDesc(const ProgramDesc &o) {
  desc_ = o.desc_;

  for (int i = 0; i < desc_.blocks_size(); ++i) {
    auto *block = desc_.mutable_blocks(i);
-    blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this));
+    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
  }
 }

-ProgramDescBind::ProgramDescBind(const ProgramDesc &desc) {
+ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
  desc_ = desc;
  for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDescBind(this, &block_desc));
+    blocks_.emplace_back(new BlockDesc(this, &block_desc));
  }
 }

-ProgramDescBind::ProgramDescBind(const std::string &binary_str) {
+ProgramDesc::ProgramDesc(const std::string &binary_str) {
  PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
                 "Fail to parse program_desc from binary string.");
  for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDescBind(this, &block_desc));
+    blocks_.emplace_back(new BlockDesc(this, &block_desc));
  }
 }


--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -23,32 +23,32 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class BlockDescBind;
+class BlockDesc;

-class ProgramDescBind {
+class ProgramDesc {
 public:
-  ProgramDescBind();
+  ProgramDesc();

-  explicit ProgramDescBind(const ProgramDesc &desc);
+  explicit ProgramDesc(const proto::ProgramDesc &desc);

-  ProgramDescBind(const ProgramDescBind &o);
+  ProgramDesc(const ProgramDesc &o);

-  explicit ProgramDescBind(const std::string &binary_str);
+  explicit ProgramDesc(const std::string &binary_str);

-  BlockDescBind *AppendBlock(const BlockDescBind &parent);
+  BlockDesc *AppendBlock(const BlockDesc &parent);

-  BlockDescBind *MutableBlock(size_t idx) { return blocks_[idx].get(); }
+  BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); }

-  const BlockDescBind &Block(size_t idx) const { return *blocks_[idx]; }
+  const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; }

  size_t Size() const { return blocks_.size(); }

-  ProgramDesc *Proto();
+  proto::ProgramDesc *Proto();

 private:
-  ProgramDesc desc_;
+  proto::ProgramDesc desc_;

-  std::vector<std::unique_ptr<BlockDescBind>> blocks_;
+  std::vector<std::unique_ptr<BlockDesc>> blocks_;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -19,18 +19,18 @@
 namespace paddle {
 namespace framework {
 TEST(ProgramDesc, copy_ctor) {
-  ProgramDescBind program;
+  ProgramDesc program;
  auto* global_block = program.MutableBlock(0);
  auto* x = global_block->Var("X");
-  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  x->SetLoDLevel(0);
-  x->SetDataType(FP32);
+  x->SetDataType(proto::FP32);
  x->SetShape({1000, 784});

  auto* y = global_block->Var("Y");
-  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  y->SetLoDLevel(0);
-  y->SetDataType(FP32);
+  y->SetDataType(proto::FP32);
  y->SetShape({784, 100});

  auto* op = global_block->AppendOp();
@@ -39,15 +39,15 @@ TEST(ProgramDesc, copy_ctor) {
  op->SetInput("Y", {y->Name()});

  auto* out = global_block->Var("Out");
-  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  op->SetOutput("Y", {out->Name()});

-  ProgramDescBind program_copy(program);
+  ProgramDesc program_copy(program);

  auto* global_block_copy = program_copy.MutableBlock(0);
  ASSERT_NE(global_block, global_block_copy);

-  auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) {
+  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
    ASSERT_TRUE(global_block_copy->HasVar(name));
    auto* copy = global_block_copy->Var(name);
    ASSERT_NE(copy, var_before);
@@ -81,18 +81,18 @@ TEST(ProgramDesc, copy_ctor) {
 }

 TEST(ProgramDescBind, serialize_and_deserialize) {
-  ProgramDescBind program_origin;
+  ProgramDesc program_origin;
  auto* global_block = program_origin.MutableBlock(0);
  auto* x = global_block->Var("X");
-  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  x->SetLoDLevel(0);
-  x->SetDataType(FP32);
+  x->SetDataType(proto::FP32);
  x->SetShape({1000, 784});

  auto* y = global_block->Var("Y");
-  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  y->SetLoDLevel(0);
-  y->SetDataType(FP32);
+  y->SetDataType(proto::FP32);
  y->SetShape({784, 100});

  auto* op = global_block->AppendOp();
@@ -101,17 +101,17 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
  op->SetInput("Y", {y->Name()});

  auto* out = global_block->Var("Out");
-  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
  op->SetOutput("Y", {out->Name()});

  std::string binary_str;
  program_origin.Proto()->SerializeToString(&binary_str);

-  ProgramDescBind program_restored(binary_str);
+  ProgramDesc program_restored(binary_str);
  auto* global_block_restored = program_restored.MutableBlock(0);
  ASSERT_NE(global_block, global_block_restored);

-  auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) {
+  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
    ASSERT_TRUE(global_block_restored->HasVar(name));
    auto* restored = global_block_restored->Var(name);
    ASSERT_NE(restored, var_before);

--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -29,7 +29,7 @@ const std::string kFetchOpType = "fetch";
 const std::string kDropOutOpType = "dropout";
 const std::string kBatchNormOpType = "batch_norm";

-bool HasDependentVar(const OpDesc& op_desc,
+bool HasDependentVar(const proto::OpDesc& op_desc,
                     const std::set<std::string>& dependent_vars) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
@@ -41,14 +41,15 @@ bool HasDependentVar(const OpDesc& op_desc,
  return false;
 }

-bool IsTarget(const OpDesc& op_desc) {
+bool IsTarget(const proto::OpDesc& op_desc) {
  if (op_desc.has_is_target()) {
    return op_desc.is_target();
  }
  return false;
 }

-void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id) {
  // TODO(tonyyang-svail):
  //    - will change to use multiple blocks for RNN op and Cond Op

@@ -104,12 +105,12 @@ void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) {
 }

 // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
-void Prune(const ProgramDesc& input, ProgramDesc* output) {
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
  prune_impl(input, output, 0);
 }

-void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
-                             int block_id) {
+void inference_optimize_impl(const proto::ProgramDesc& input,
+                             proto::ProgramDesc* output, int block_id) {
  *output = input;
  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
  for (auto& op_desc : *op_field) {
@@ -125,7 +126,8 @@ void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output,
  }
 }

-void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) {
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output) {
  inference_optimize_impl(input, output, 0);
 }


--- a/paddle/framework/prune.h
+++ b/paddle/framework/prune.h
@@ -20,9 +20,10 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-void Prune(const ProgramDesc& input, ProgramDesc* output);
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output);

-void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output);
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output);

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/prune_test.cc
+++ b/paddle/framework/prune_test.cc
@@ -29,12 +29,12 @@ namespace ops = paddle::operators;

 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
           const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
+           paddle::framework::BlockDesc *block) {
  // insert output
  for (auto kv : outputs) {
    for (auto v : kv.second) {
      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
+      var->SetDataType(paddle::framework::proto::DataType::FP32);
    }
  }

@@ -51,26 +51,26 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
 }

 TEST(Prune, one_operator) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);

  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
-  f::ProgramDesc pruned;
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc pruned;

-  Prune(*pdesc, &pruned);
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0);

  pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
-  Prune(*pdesc, &pruned);
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1);
 }

 TEST(Prune, forward) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);

  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
        block);
@@ -81,19 +81,19 @@ TEST(Prune, forward) {
  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();

  for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
-    f::ProgramDesc pruned;
+    f::proto::ProgramDesc pruned;
    pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
-    Prune(*pdesc, &pruned);
+    f::Prune(*pdesc, &pruned);
    PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1);
  }
 }

 TEST(Prune, multi_input_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);

  AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
        block);
@@ -104,17 +104,17 @@ TEST(Prune, multi_input_op) {
  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
        f::AttributeMap{}, block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
  pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);

-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4);
 }

 TEST(Prune, multi_output_op) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);

  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
        f::AttributeMap{}, block);
@@ -123,17 +123,17 @@ TEST(Prune, multi_output_op) {
  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);

-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2);
 }

 TEST(Prune, multi_target) {
-  f::ProgramDescBind program;
-  f::BlockDescBind *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);

  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
        f::AttributeMap{}, block);
@@ -142,11 +142,11 @@ TEST(Prune, multi_target) {
  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
        block);

-  f::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc *pdesc = program.Proto();
  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);

-  f::ProgramDesc pruned;
-  Prune(*pdesc, &pruned);
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3);
 }
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -57,17 +57,17 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
    SetDim(names[i], dims[i]);
  }
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetInputsVarType(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
    const std::string &name) const {
  return GetVarTypes(Inputs(name));
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
    const std::string &name) const {
  return GetVarTypes(Outputs(name));
 }
-std::vector<VarDesc::VarType> InferShapeContext::GetVarTypes(
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
    const std::vector<std::string> &names) const {
-  std::vector<VarDesc::VarType> retv;
+  std::vector<proto::VarDesc::VarType> retv;
  retv.resize(names.size());
  std::transform(names.begin(), names.end(), retv.begin(),
                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -27,8 +27,9 @@ class InferShapeContext {
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;

-  std::vector<VarDesc::VarType> GetInputsVarType(const std::string &name) const;
-  std::vector<VarDesc::VarType> GetOutputsVarType(
+  std::vector<proto::VarDesc::VarType> GetInputsVarType(
+      const std::string &name) const;
+  std::vector<proto::VarDesc::VarType> GetOutputsVarType(
      const std::string &name) const;

  virtual bool HasInputs(const std::string &name) const = 0;
@@ -65,10 +66,10 @@ class InferShapeContext {
  std::vector<framework::DDim> GetDims(
      const std::vector<std::string> &names) const;

-  std::vector<VarDesc::VarType> GetVarTypes(
+  std::vector<proto::VarDesc::VarType> GetVarTypes(
      const std::vector<std::string> &names) const;

-  virtual VarDesc::VarType GetVarType(const std::string &name) const = 0;
+  virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0;
 };

 }  // namespace framework

--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -25,11 +25,9 @@
 namespace paddle {
 namespace framework {
 class OperatorBase;
-class OpDescBind;
-class BlockDescBind;
-class BlockDesc;
+class OpDesc;
 class InferShapeContext;
-class BlockDescBind;
+class BlockDesc;

 using VariableNameMap = std::map<std::string, std::vector<std::string>>;

@@ -37,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                   std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDescBind*>;
+                   std::vector<bool>, BlockDesc*>;

 using AttributeMap = std::unordered_map<std::string, Attribute>;

@@ -45,13 +43,13 @@ using OpCreator = std::function<OperatorBase*(
    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;

-using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDescBind>>(
-    const OpDescBind&, const std::unordered_set<std::string>& /*no_grad_set*/,
+using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
+    const OpDesc&, const std::unordered_set<std::string>& /*no_grad_set*/,
    std::unordered_map<std::string, std::string>* /*grad_to_var*/,
-    const std::vector<BlockDescBind*>& grad_block)>;
+    const std::vector<BlockDesc*>& grad_block)>;

-using InferVarTypeFN = std::function<void(const OpDescBind& /*op_desc*/,
-                                          BlockDescBind* /*block*/)>;
+using InferVarTypeFN =
+    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;

 using InferShapeFN = std::function<void(InferShapeContext*)>;


--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -18,30 +18,32 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
+proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); }

-void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
+void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); }

-void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
+void VarDesc::SetShape(const std::vector<int64_t> &dims) {
  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }

-void VarDescBind::SetDataType(DataType data_type) {
+void VarDesc::SetDataType(proto::DataType data_type) {
  mutable_tensor_desc()->set_data_type(data_type);
 }

-std::vector<int64_t> VarDescBind::Shape() const {
+std::vector<int64_t> VarDesc::Shape() const {
  return RepeatedToVector(tensor_desc().dims());
 }

-DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
+proto::DataType VarDesc::GetDataType() const {
+  return tensor_desc().data_type();
+}

-void VarDescBind::SetLoDLevel(int32_t lod_level) {
+void VarDesc::SetLoDLevel(int32_t lod_level) {
  switch (desc_.type()) {
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
      break;
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      desc_.mutable_tensor_array()->set_lod_level(lod_level);
      break;
    default:
@@ -50,11 +52,11 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
  }
 }

-int32_t VarDescBind::GetLodLevel() const {
+int32_t VarDesc::GetLodLevel() const {
  switch (desc_.type()) {
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      return desc_.lod_tensor().lod_level();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().lod_level();
    default:
      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
@@ -62,29 +64,29 @@ int32_t VarDescBind::GetLodLevel() const {
  }
 }

-const TensorDesc &VarDescBind::tensor_desc() const {
+const proto::TensorDesc &VarDesc::tensor_desc() const {
  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
  switch (desc_.type()) {
-    case VarDesc::SELECTED_ROWS:
+    case proto::VarDesc::SELECTED_ROWS:
      return desc_.selected_rows();
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      return desc_.lod_tensor().tensor();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.tensor_array().tensor();
    default:
      PADDLE_THROW("Unexpected branch.");
  }
 }

-TensorDesc *VarDescBind::mutable_tensor_desc() {
+proto::TensorDesc *VarDesc::mutable_tensor_desc() {
  PADDLE_ENFORCE(desc_.has_type(),
                 "invoke MutableTensorDesc must after set type");
  switch (desc_.type()) {
-    case VarDesc::SELECTED_ROWS:
+    case proto::VarDesc::SELECTED_ROWS:
      return desc_.mutable_selected_rows();
-    case VarDesc::LOD_TENSOR:
+    case proto::VarDesc::LOD_TENSOR:
      return desc_.mutable_lod_tensor()->mutable_tensor();
-    case VarDesc::LOD_TENSOR_ARRAY:
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
      return desc_.mutable_tensor_array()->mutable_tensor();
    default:
      PADDLE_THROW("Unexpected branch.");

--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -53,44 +53,44 @@ inline void VectorToRepeated(const std::vector<bool> &vec,
  }
 }

-class VarDescBind {
+class VarDesc {
 public:
-  explicit VarDescBind(const std::string &name) {
+  explicit VarDesc(const std::string &name) {
    desc_.set_name(name);
-    desc_.set_type(VarDesc::LOD_TENSOR);
+    desc_.set_type(proto::VarDesc::LOD_TENSOR);
  }

-  explicit VarDescBind(const VarDesc &desc) : desc_(desc) {}
+  explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}

-  VarDesc *Proto() { return &desc_; }
+  proto::VarDesc *Proto() { return &desc_; }

  std::string Name() const { return desc_.name(); }

  void SetShape(const std::vector<int64_t> &dims);

-  void SetDataType(DataType data_type);
+  void SetDataType(proto::DataType data_type);

  std::vector<int64_t> Shape() const;

-  DataType GetDataType() const;
+  proto::DataType GetDataType() const;

  void SetLoDLevel(int32_t lod_level);

  int32_t GetLodLevel() const;

-  VarDesc::VarType GetType() const;
+  proto::VarDesc::VarType GetType() const;

-  void SetType(VarDesc::VarType type);
+  void SetType(proto::VarDesc::VarType type);

  bool Persistable() const { return desc_.persistable(); }

  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }

 private:
-  const TensorDesc &tensor_desc() const;
-  TensorDesc *mutable_tensor_desc();
+  const proto::TensorDesc &tensor_desc() const;
+  proto::TensorDesc *mutable_tensor_desc();

-  VarDesc desc_;
+  proto::VarDesc desc_;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -20,15 +20,15 @@

 namespace paddle {
 namespace framework {
-inline VarDesc::VarType ToVarType(std::type_index type) {
+inline proto::VarDesc::VarType ToVarType(std::type_index type) {
  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-    return VarDesc_VarType_LOD_TENSOR;
+    return proto::VarDesc_VarType_LOD_TENSOR;
  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
-    return VarDesc_VarType_LOD_RANK_TABLE;
+    return proto::VarDesc_VarType_LOD_RANK_TABLE;
  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
-    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+    return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-    return VarDesc_VarType_SELECTED_ROWS;
+    return proto::VarDesc_VarType_SELECTED_ROWS;
  } else {
    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
  }
@@ -37,16 +37,16 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
 template <typename Visitor>
 inline void VisitVarType(const Variable& var, Visitor visitor) {
  switch (ToVarType(var.Type())) {
-    case VarDesc_VarType_LOD_TENSOR:
+    case proto::VarDesc_VarType_LOD_TENSOR:
      visitor(var.Get<framework::LoDTensor>());
      return;
-    case VarDesc_VarType_LOD_RANK_TABLE:
+    case proto::VarDesc_VarType_LOD_RANK_TABLE:
      visitor(var.Get<LoDRankTable>());
      return;
-    case VarDesc_VarType_LOD_TENSOR_ARRAY:
+    case proto::VarDesc_VarType_LOD_TENSOR_ARRAY:
      visitor(var.Get<LoDTensorArray>());
      return;
-    case VarDesc_VarType_SELECTED_ROWS:
+    case proto::VarDesc_VarType_SELECTED_ROWS:
      visitor(var.Get<SelectedRows>());
      return;
    default:

--- a/paddle/framework/var_type_inference.h
+++ b/paddle/framework/var_type_inference.h
@@ -21,8 +21,7 @@ namespace framework {
 class VarTypeInference {
 public:
  virtual ~VarTypeInference() {}
-  virtual void operator()(const OpDescBind& op_desc,
-                          BlockDescBind* block) const = 0;
+  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
 };

 }  // namespace framework

--- a/paddle/framework/var_type_inference_test.cc
+++ b/paddle/framework/var_type_inference_test.cc
@@ -33,17 +33,16 @@ class SumOpMaker : public OpProtoAndCheckerMaker {

 class SumOpVarTypeInference : public VarTypeInference {
 public:
-  void operator()(const OpDescBind &op_desc,
-                  BlockDescBind *block) const override {
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
    auto &inputs = op_desc.Input("X");
-    auto default_var_type = VarDesc::SELECTED_ROWS;
+    auto default_var_type = proto::VarDesc::SELECTED_ROWS;

    bool any_input_is_lod_tensor = std::any_of(
        inputs.begin(), inputs.end(), [block](const std::string &name) {
-          return block->Var(name)->GetType() == VarDesc::LOD_TENSOR;
+          return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR;
        });
    if (any_input_is_lod_tensor) {
-      default_var_type = VarDesc::LOD_TENSOR;
+      default_var_type = proto::VarDesc::LOD_TENSOR;
    }

    auto out_var_name = op_desc.Output("Out").front();
@@ -62,43 +61,43 @@ namespace paddle {
 namespace framework {

 TEST(InferVarType, sum_op) {
-  ProgramDescBind prog;
+  ProgramDesc prog;
  auto *op = prog.MutableBlock(0)->AppendOp();
  op->SetType("sum");
  op->SetInput("X", {"test_a", "test_b", "test_c"});
  op->SetOutput("Out", {"test_out"});

-  prog.MutableBlock(0)->Var("test_a")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test_out");

  op->InferVarType(prog.MutableBlock(0));

-  ASSERT_EQ(VarDesc::SELECTED_ROWS,
+  ASSERT_EQ(proto::VarDesc::SELECTED_ROWS,
            prog.MutableBlock(0)->Var("test_out")->GetType());

-  prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR);
  op->InferVarType(prog.MutableBlock(0));
-  ASSERT_EQ(VarDesc::LOD_TENSOR,
+  ASSERT_EQ(proto::VarDesc::LOD_TENSOR,
            prog.MutableBlock(0)->Var("test_out")->GetType());
 }

 TEST(InferVarType, sum_op_without_infer_var_type) {
-  ProgramDescBind prog;
+  ProgramDesc prog;
  auto *op = prog.MutableBlock(0)->AppendOp();
  op->SetType("sum_without_infer_var_type");
  op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
  op->SetOutput("Out", {"test2_out"});

-  prog.MutableBlock(0)->Var("test2_a")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_b")->SetType(VarDesc::SELECTED_ROWS);
-  prog.MutableBlock(0)->Var("test2_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test2_out");

  op->InferVarType(prog.MutableBlock(0));

-  ASSERT_EQ(VarDesc_VarType_LOD_TENSOR,
+  ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR,
            prog.MutableBlock(0)->Var("test2_out")->GetType());
 }


--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -62,33 +62,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
  }
 }

-template <>
-void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(src_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::CPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
-}
-
-template <>
-void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
-                                                  void* dst,
-                                                  platform::GPUPlace src_place,
-                                                  const void* src, size_t num) {
-  platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
-}
-
 #endif

 }  // namespace memory

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -63,8 +63,7 @@ class AccuracyOp : public framework::OperatorWithKernel {

 class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AccuracyOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // TODO(typhoonzero): support both inference value and indices.
    AddInput("Out", "The network output of topk (inferences)");

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -26,7 +26,7 @@ template <int BlockSize>
 __global__ void AccuracyCudaKernel(const int N, const int D,
                                   const int64_t* Xdata,
                                   const int64_t* labeldata, int* correct_data,
-                                   float* accuracy) {
+                                   float* accuracy, int* total_data) {
  int count = 0;
  __shared__ int total[BlockSize];

@@ -47,6 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
  if (threadIdx.x == 0) {
    *correct_data = result;
    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *total_data = N;
  }
 }

@@ -80,22 +81,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
    if (num_samples == 0) {
      return;
    }
-    platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int),
-                             cudaMemcpyHostToDevice, stream);

    AccuracyCudaKernel<
        PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        num_samples, infer_width, indices_data, label_data, correct_data,
-        accuracy_data);
-
-    int d_num_samples, d_num_correct;
-    float d_accuracy;
-    platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int),
-                             cudaMemcpyDeviceToHost, stream);
-    platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int),
-                             cudaMemcpyDeviceToHost, stream);
-    platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float),
-                             cudaMemcpyDeviceToHost, stream);
+        accuracy_data, total_data);
  }
 };


--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -38,9 +38,8 @@ class ActivationOpGrad : public framework::OperatorWithKernel {

 class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Sigmoid operator");
    AddOutput("Y", "Output of Sigmoid operator");
    AddComment(R"DOC(
@@ -54,9 +53,8 @@ $$y = \frac{1}{1 + e^{-x}}$$

 class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LogSigmoidOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of LogSigmoid operator");
    AddOutput("Y", "Output of LogSigmoid operator");
    AddComment(R"DOC(
@@ -70,8 +68,8 @@ $$y = \log \frac{1}{1 + e^{-x}}$$

 class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Exp operator");
    AddOutput("Y", "Output of Exp operator");
    AddComment(R"DOC(
@@ -85,8 +83,8 @@ $y = e^x$

 class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Relu operator");
    AddOutput("Y", "Output of Relu operator");
    AddComment(R"DOC(
@@ -100,9 +98,8 @@ $y = \max(x, 0)$

 class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LeakyReluOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of LeakyRelu operator");
    AddOutput("Y", "Output of LeakyRelu operator");
    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
@@ -117,9 +114,8 @@ $y = \max(x, \alpha * x)$

 class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softshrink operator");
    AddOutput("Y", "Output of Softshrink operator");
    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
@@ -140,8 +136,8 @@ $$

 class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Tanh operator");
    AddOutput("Y", "Output of Tanh operator");
    AddComment(R"DOC(
@@ -155,9 +151,8 @@ $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TanhShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of TanhShrink operator");
    AddOutput("Y", "Output of TanhShrink operator");
    AddComment(R"DOC(
@@ -171,9 +166,8 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$

 class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HardShrinkOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of HardShrink operator");
    AddOutput("Y", "Output of HardShrink operator");
    AddAttr<float>("threshold", "The value of threshold for HardShrink")
@@ -195,8 +189,8 @@ $$

 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Sqrt operator");
    AddOutput("Y", "Output of Sqrt operator");
    AddComment(R"DOC(
@@ -210,8 +204,8 @@ $y = \sqrt{x}$

 class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Abs operator");
    AddOutput("Y", "Output of Abs operator");
    AddComment(R"DOC(
@@ -225,8 +219,8 @@ $y = |x|$

 class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Ceil operator");
    AddOutput("Y", "Output of Ceil operator");
    AddComment(R"DOC(
@@ -240,8 +234,8 @@ $y = ceil(x)$

 class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Floor operator");
    AddOutput("Y", "Output of Floor operator");
    AddComment(R"DOC(
@@ -255,8 +249,8 @@ $y = floor(x)$

 class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Round operator");
    AddOutput("Y", "Output of Round operator");
    AddComment(R"DOC(
@@ -270,9 +264,8 @@ $y = [x]$

 class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReciprocalOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Reciprocal operator");
    AddOutput("Y", "Output of Reciprocal operator");
    AddComment(R"DOC(
@@ -286,8 +279,8 @@ $$y = \frac{1}{x}$$

 class LogOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  LogOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Log operator");
    AddOutput("Y", "Output of Log operator");
    AddComment(R"DOC(
@@ -303,8 +296,8 @@ Natural logarithm of x.

 class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Square operator");
    AddOutput("Y", "Output of Square operator");
    AddComment(R"DOC(
@@ -318,9 +311,8 @@ $y = x^2$

 class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftplusOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softplus operator");
    AddOutput("Y", "Output of Softplus operator");
    AddComment(R"DOC(
@@ -334,9 +326,8 @@ $y = \ln(1 + e^{x})$

 class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftsignOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Softsign operator");
    AddOutput("Y", "Output of Softsign operator");
    AddComment(R"DOC(
@@ -350,8 +341,8 @@ $$y = \frac{x}{1 + |x|}$$

 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of BRelu operator");
    AddOutput("Y", "Output of BRelu operator");
    AddAttr<float>("t_min", "The min marginal value of BRelu")
@@ -369,9 +360,8 @@ $y = \max(\min(x, t_{min}), t_{max})$

 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftReluOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of SoftRelu operator");
    AddOutput("Y", "Output of SoftRelu operator");
    AddAttr<float>("threshold", "The threshold value of SoftRelu")
@@ -387,8 +377,8 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$

 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ELU operator");
    AddOutput("Y", "Output of ELU operator");
    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
@@ -406,8 +396,8 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$

 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Relu6 operator");
    AddOutput("Y", "Output of Relu6 operator");
    AddAttr<float>("threshold", "The threshold value of Relu6")
@@ -423,8 +413,8 @@ $y = \min(\max(0, x), 6)$

 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  PowOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Pow operator");
    AddOutput("Y", "Output of Pow operator");
    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
@@ -439,8 +429,8 @@ $y = x^{factor}$

 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of STanh operator");
    AddOutput("Y", "Output of STanh operator");
    AddAttr<float>("scale_a", "The scale parameter of a for the input")
@@ -458,9 +448,8 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$

 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ThresholdedReluOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of ThresholdedRelu operator");
    AddOutput("Y", "Output of ThresholdedRelu operator");
    AddAttr<float>("threshold", "The threshold location of activation")
@@ -481,9 +470,8 @@ $$

 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HardSigmoidOpMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of HardSigmoid operator");
    AddOutput("Y", "Output of HardSigmoid operator");
    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
@@ -508,8 +496,8 @@ It is recommended to use the defaults for this activation.

 class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input of Swish operator");
    AddOutput("Y", "Output of Swish operator");
    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);

--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -59,8 +59,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {

 class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdadeltaOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -59,8 +59,7 @@ class AdagradOp : public framework::OperatorWithKernel {

 class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdagradOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/adam_op.cc
+++ b/paddle/operators/adam_op.cc
@@ -73,7 +73,7 @@ class AdamOp : public framework::OperatorWithKernel {

 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
@@ -67,7 +67,7 @@ class AdamaxOp : public framework::OperatorWithKernel {

 class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/array_to_lod_tensor_op.cc
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -114,8 +114,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {

 class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(std::vector<LodTensor>) A vector of tensors that is going to "
@@ -150,14 +149,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("lod_tensor_to_array");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetInput("RankTable", Input("RankTable"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
@@ -86,8 +86,7 @@ class AssignOp : public framework::OperatorBase {

 class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AssignOpProtoMaker(framework::OpProto *proto,
-                     framework::OpAttrChecker *op_checker)
+  AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
@@ -109,8 +108,8 @@ class AssignInferShape : public framework::InferShapeBase {
  void operator()(framework::InferShapeContext *context) const override {
    if (context->HasInput("X")) {
      auto type = context->GetInputsVarType("X")[0];
-      if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
-          type == framework::VarDesc_VarType_LOD_TENSOR) {
+      if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS ||
+          type == framework::proto::VarDesc_VarType_LOD_TENSOR) {
        context->SetOutputDim("Out", context->GetInputDim("X"));
      }
    }
@@ -122,12 +121,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
    op->SetType("assign");
    op->SetInput("X", OutputGrad("Out"));
    op->SetOutput("Out", InputGrad("X"));
-    return std::unique_ptr<framework::OpDescBind>(op);
+    return std::unique_ptr<framework::OpDesc>(op);
  }
 };


--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -49,7 +49,7 @@ class AucOp : public framework::OperatorWithKernel {

 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  AucOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Out",
             "A floating point 2D tensor, values are in the range [0, 1]."

--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/operators/batch_norm_op.h"
+#include "paddle/framework/data_layout.h"

 namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;

 template <typename T>
 using EigenArrayMap =
@@ -60,14 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
                      "Variance and VarianceOut should share the same memory");

    const auto x_dims = ctx->GetInputDim("X");
-    const TensorFormat tensor_format =
-        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    const DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));

    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "Input X must have 2 to 5 dimensions.");

    const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);

    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
@@ -85,13 +87,12 @@ class BatchNormOp : public framework::OperatorWithKernel {

 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BatchNormOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<bool>("is_test", "").SetDefault(false);
    AddAttr<float>("momentum", "").SetDefault(0.9);
    AddAttr<float>("epsilon", "").SetDefault(1e-5);
-    AddAttr<std::string>("tensor_format", "").SetDefault("NCHW");
+    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
    AddInput("X", "The input tensor");
    AddInput("Scale",
             "Scale is a 1-dimensional tensor of size C "
@@ -142,9 +143,9 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
    const float epsilon = ctx.Attr<float>("epsilon");
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
@@ -152,7 +153,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
                   "The Input dim size should be between 2 and 5");
    const int N = x_dims[0];
    const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);
    const int sample_size = x->numel() / N / C;

@@ -178,8 +179,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
      saved_mean_e.setZero();
      saved_variance_e.setZero();

-      switch (tensor_format) {
-        case TensorFormat::NCHW: {
+      switch (data_layout) {
+        case DataLayout::kNCHW: {
          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
          for (int nc = 0; nc < N * C; ++nc) {
            saved_mean_e(nc % C) += x_arr.col(nc).sum();
@@ -192,7 +193,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
          saved_variance_e /= N * sample_size;
          break;
        }
-        case TensorFormat::NHWC: {
+        case DataLayout::kNHWC: {
          ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
          for (int i = 0; i < N * sample_size; ++i) {
            saved_mean_e += x_arr.col(i);
@@ -206,7 +207,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
          break;
        }
        default:
-          PADDLE_THROW("Unknown storage order: %s", tensor_format_str);
+          PADDLE_THROW("Unknown storage order: %s", data_layout_str);
      }

      EigenVectorArrayMap<T> running_mean_arr(
@@ -248,8 +249,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
        bias_arr - mean_arr * inv_std * scale_arr;

-    switch (tensor_format) {
-      case TensorFormat::NCHW: {
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
        EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
                               N * C);
        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
@@ -258,7 +259,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
        }
        break;
      }
-      case TensorFormat::NHWC: {
+      case DataLayout::kNHWC: {
        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
                         N * sample_size) =
            (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
@@ -268,7 +269,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
        break;
      }
      default:
-        PADDLE_THROW("Unknown storage order: %d", tensor_format);
+        PADDLE_THROW("Unknown storage order: %d", data_layout);
    }
  }
 };
@@ -291,10 +292,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");

    const auto x_dims = ctx->GetInputDim("X");
-    const TensorFormat tensor_format =
-        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    const DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
    const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);

    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
@@ -334,9 +335,9 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    // SavedVariance have been reverted in forward operator
    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);

    // Get the size for each dimension.
    // NCHW [batch_size, in_channels, in_height, in_width]
@@ -345,7 +346,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
                   "The Input dim size should be between 2 and 5");
    const int N = x_dims[0];
    const int C =
-        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
                                          : x_dims[x_dims.size() - 1]);
    const int sample_size = x->numel() / N / C;

@@ -377,8 +378,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>

    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);

-    switch (tensor_format) {
-      case TensorFormat::NCHW: {
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
@@ -401,7 +402,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
        }
        break;
      }
-      case TensorFormat::NHWC: {
+      case DataLayout::kNHWC: {
        ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
@@ -426,7 +427,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
        break;
      }
      default:
-        PADDLE_THROW("Unknown storage order: %s", tensor_format_str);
+        PADDLE_THROW("Unknown storage order: %s", data_layout_str);
    }
  }
 };

--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/operators/batch_norm_op.h"
+#include "paddle/framework/data_layout.h"

 #include <cfloat>
 #include "paddle/operators/math/math_function.h"
@@ -22,12 +23,12 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;

-void ExtractNCWHD(const framework::DDim &dims,
-                  const TensorFormat &tensor_format, int *N, int *C, int *H,
-                  int *W, int *D) {
+void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
+                  int *N, int *C, int *H, int *W, int *D) {
  *N = dims[0];
  if (dims.size() == 2) {
    *C = dims[1];
@@ -35,13 +36,13 @@ void ExtractNCWHD(const framework::DDim &dims,
    *W = 1;
    *D = 1;
  } else {
-    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
-    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
    *W = dims.size() > 3
-             ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2])
+             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
             : 1;
    *D = dims.size() > 4
-             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
+             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
             : 1;
  }
 }
@@ -56,9 +57,9 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
    const float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);

    // Get the size for each dimension.
    // NCHW [batch_size, in_channels, in_height, in_width]
@@ -67,7 +68,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);

    // ------------------- cudnn descriptors ---------------------
    cudnnTensorDescriptor_t data_desc_;
@@ -93,7 +94,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    VLOG(1) << "Setting descriptors.";
    std::vector<int> dims;
    std::vector<int> strides;
-    if (tensor_format == TensorFormat::NCHW) {
+    if (data_layout == DataLayout::kNCHW) {
      dims = {N, C, H, W, D};
      strides = {C * H * W * D, H * W * D, W * D, D, 1};
    } else {
@@ -180,9 +181,9 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use GPUPlace.");
    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
-    const std::string tensor_format_str =
-        ctx.Attr<std::string>("tensor_format");
-    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
    const auto *x = ctx.Input<Tensor>("X");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
@@ -192,7 +193,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
-    ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);

    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
    PADDLE_ENFORCE_EQ(scale->dims()[0], C);
@@ -219,7 +220,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>

    std::vector<int> dims;
    std::vector<int> strides;
-    if (tensor_format == TensorFormat::NCHW) {
+    if (data_layout == DataLayout::kNCHW) {
      dims = {N, C, H, W, D};
      strides = {C * H * W * D, H * W * D, W * D, D, 1};
    } else {

--- a/paddle/operators/batch_norm_op.h
+++ b/paddle/operators/batch_norm_op.h
@@ -19,21 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-enum TensorFormat {
-  NHWC = 0,
-  NCHW = 1,
-};
-
-inline TensorFormat StringToTensorFormat(const std::string& str) {
-  if (str == "NHWC" || str == "nhwc") {
-    return TensorFormat::NHWC;
-  } else if (str == "NCHW" || str == "nchw") {
-    return TensorFormat::NCHW;
-  } else {
-    PADDLE_THROW("Unknown storage order string: %s", str);
-  }
-}
-
 template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
 public:

--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -83,9 +83,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {

 class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
-                               framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Ids",
             "(LodTensorArray)"
             "score of the candidate words in each step");
@@ -120,13 +119,13 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {

 class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind& op_desc,
-                  framework::BlockDescBind* block) const override {
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
    for (auto& o : op_desc.Output("SentenceIds")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
    }
    for (auto& o : op_desc.Output("SentenceScores")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
    }
  }
 };

--- a/paddle/operators/beam_search_op.cc
+++ b/paddle/operators/beam_search_op.cc
@@ -153,8 +153,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
 class BeamSearchProtoAndCheckerMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  BeamSearchProtoAndCheckerMaker(framework::OpProto *proto,
-                                 framework::OpAttrChecker *op_checker)
+  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // inputs and outputs stored in proto
    AddInput("pre_ids", "ids in previous step");

--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {

 class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BilinearTensorProductOpMaker(framework::OpProto* proto,
-                               framework::OpAttrChecker* op_checker)
+  BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of bilinear_tensor_product operator.");
    AddInput("Y", "The second input of bilinear_tensor_product operator.");

--- a/paddle/operators/cast_op.cc
+++ b/paddle/operators/cast_op.cc
@@ -20,8 +20,7 @@ namespace operators {

 class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CastOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of cast op");
    AddOutput("Out", "The output tensor of cast op");
@@ -53,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto grad = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad = new framework::OpDesc();
    grad->SetType("cast");
    grad->SetInput("X", OutputGrad("Out"));
    grad->SetOutput("Out", InputGrad("X"));
    grad->SetAttr("out_dtype", GetAttr("in_dtype"));
    grad->SetAttr("in_dtype", GetAttr("out_dtype"));
-    return std::unique_ptr<framework::OpDescBind>(grad);
+    return std::unique_ptr<framework::OpDesc>(grad);
  }
 };


--- a/paddle/operators/cast_op.h
+++ b/paddle/operators/cast_op.h
@@ -55,7 +55,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    framework::VisitDataType(
-        static_cast<framework::DataType>(context.Attr<int>("out_dtype")),
+        static_cast<framework::proto::DataType>(context.Attr<int>("out_dtype")),
        CastOpFunctor<DeviceContext, InT>(
            in, out, context.template device_context<DeviceContext>()));
  }

--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
@@ -57,15 +57,14 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(framework::DataType::FP32,
+    return framework::OpKernelType(framework::proto::DataType::FP32,
                                   ctx.device_context());
  }
 };

 class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ChunkEvalOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Inference",
             "(Tensor, default: Tensor<int64_t>). "

--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel {

 class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ClipByNormOpMaker(framework::OpProto* proto,
-                    framework::OpAttrChecker* op_checker)
+  ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input of clip_by_norm op."

--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -38,7 +38,7 @@ class ClipOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ClipOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor)The input of clip op."

--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CompareOpProtoMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    OpComment comment;
    AddInput("X",

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -58,7 +58,7 @@ class ConcatOp : public framework::OperatorWithKernel {

 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
    AddOutput("Out", "Output tensor of concat operator.");
@@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
-            ops::ConcatOpGrad)
+REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
+               ops::ConcatOpGrad, false)
 REGISTER_OP_CPU_KERNEL(concat,
                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
 REGISTER_OP_CPU_KERNEL(concat_grad,

--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -205,8 +205,7 @@ void CondOp::Run(const Scope& scope,

 class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CondOpProtoAndCheckerMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Cond", "The condition, which is a bool vector");
    AddInput("Xs", "Inputs of Subnets").AsDuplicable();

--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
@@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp {
      scopes->front() = &scope.NewScope();
      auto &cur_scope = *scopes->front();

-      auto *block = Attr<framework::BlockDescBind *>("sub_block");
+      auto *block = Attr<framework::BlockDesc *>("sub_block");
      framework::Executor exec(dev_ctx);
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
    }
@@ -74,8 +74,7 @@ class ConditionalBlockOp : public ConditionalOp {

 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The conditional variable of this operator. If X is empty, the "
@@ -87,7 +86,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
              "(std::vector<Scope*>) The step scope of conditional block. To "
              "unify the conditional block, rnn and while op, the type of "
              "scope is std::vector<Scope*>");
-    AddAttr<framework::BlockDescBind *>(
+    AddAttr<framework::BlockDesc *>(
        "sub_block", "The step block of conditional block operator");
    AddComment(R"DOC(Conditional block operator

@@ -117,7 +116,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
      framework::Scope &cur_scope = *scopes[0];

-      auto *block = Attr<framework::BlockDescBind *>("sub_block");
+      auto *block = Attr<framework::BlockDesc *>("sub_block");
      framework::Executor exec(dev_ctx);
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);

@@ -171,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad_op = new framework::OpDesc();
    grad_op->SetType("conditional_block_grad");
    grad_op->SetInput("X", Input("X"));
    grad_op->SetInput("Params", Input("Params"));
    grad_op->SetInput("Out", Output("Out"));
    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    grad_op->SetInput("Scope", Output("Scope"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
+    grad_op->SetOutput(framework::GradVarName("Params"),
+                       InputGrad("Params", false));
    grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -19,8 +19,7 @@ namespace operators {

 class CudnnConv2DOpMaker : public Conv2DOpMaker {
 public:
-  CudnnConv2DOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv2DOpMaker(proto, op_checker) {
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "
@@ -34,8 +33,7 @@ class CudnnConv2DOpMaker : public Conv2DOpMaker {

 class CudnnConv3DOpMaker : public Conv3DOpMaker {
 public:
-  CudnnConv3DOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CudnnConv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv3DOpMaker(proto, op_checker) {
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -66,8 +66,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

-Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
@@ -138,8 +137,7 @@ $$
 )DOC");
 }

-Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
-                             framework::OpAttrChecker* op_checker)
+Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",

--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -50,14 +50,12 @@ inline bool IsExpand(std::vector<int64_t>& filter_dim,
 // operator implementations can reuse the code.
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv2DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv3DOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class ConvOp : public framework::OperatorWithKernel {

--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
@@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel {

 class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ConvShiftOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "

--- a/paddle/operators/conv_transpose_cudnn_op.cc
+++ b/paddle/operators/conv_transpose_cudnn_op.cc
@@ -19,11 +19,8 @@ namespace operators {

 class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
 public:
-  CudnnConv2DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
+  CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv2DTransposeOpMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault({1, 1});
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "
                 "workspace is a section of GPU memory which will be "
@@ -36,11 +33,8 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {

 class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
 public:
-  CudnnConv3DTransposeOpMaker(framework::OpProto* proto,
-                              framework::OpAttrChecker* op_checker)
+  CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : Conv3DTransposeOpMaker(proto, op_checker) {
-    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
-        .SetDefault({1, 1, 1});
    AddAttr<int>("workspace_size_MB",
                 "workspace size for cudnn, in MB, "
                 "workspace is a section of GPU memory which will be "

--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
  auto filter_dims = ctx->GetInputDim("Filter");
  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");

  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                 "ConvTransposeOp intput should be 4-D or 5-D tensor.");
@@ -41,20 +42,24 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
                    "ConvTransposeOp paddings dimension and strides "
                    "dimension should be the same.");
+  PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(),
+                    "ConvTransposeOp paddings dimension and dilations "
+                    "dimension should be the same.");
  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
                    "In ConvTransposeOp, The input channel should be the same "
                    "as the number of filters.");

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
  for (size_t i = 0; i < strides.size(); ++i) {
+    auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
-                           filter_dims[i + 2]);
+                           filter_extent);
  }
  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
 }

-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "Input",
@@ -73,6 +78,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
  AddOutput("Output",
            "(Tensor) The output tensor of convolution transpose operator. "
            "The format of output tensor is also NCHW.");
+
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1}), the "
+                            "dilations(h_dilation, w_dilation) of convolution "
+                            "transpose operator.")
+      .SetDefault({1, 1});
  AddAttr<std::vector<int>>(
      "strides",
      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
@@ -87,7 +98,7 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
 Convolution2D Transpose Operator.

 The convolution transpose operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and dilations, strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
 Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
 number of channels, H is the height of the feature, and W is the width of the feature.
@@ -112,8 +123,8 @@ Example:
 )DOC");
 }

-Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
-    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("Input",
           "(Tensor) The input tensor of convolution transpose operator."
@@ -136,6 +147,13 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
            "Where N is batch size, C is "
            "the number of channels, D is the depth of the feature, H is the "
            "height of the feature, and W is the width of the feature.");
+
+  AddAttr<std::vector<int>>(
+      "dilations",
+      "(vector<int> default:{1, 1, 1}), the "
+      "dilations(d_dilation,h_dilation, w_dilation) of convolution "
+      "transpose operator.")
+      .SetDefault({1, 1, 1});
  AddAttr<std::vector<int>>("strides",
                            "(vector<int> default:{1, 1, 1}), the "
                            "strides{d_stride, h_stride, w_stride} of "
@@ -149,7 +167,7 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
 Convolution3D Transpose Operator.

 The convolution transpose operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and dilations, strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
 Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
 number of channels, D is the depth of the feature, H is the height of the feature,

--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -30,14 +30,12 @@ using DDim = framework::DDim;
 // operator implementations can reuse the code.
 class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv2DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
+  Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Conv3DTransposeOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker);
+  Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class ConvTransposeOp : public framework::OperatorWithKernel {
@@ -63,6 +61,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {

    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
    // groups will alway be disabled in conv2dtranspose.

    const int batch_size = static_cast<int>(input->dims()[0]);
@@ -115,7 +114,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {

    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
    math::Col2VolFunctor<DeviceContext, T> col2vol;
-    std::vector<int> dilations({1, 1, 1});

    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
    // on input)
@@ -167,6 +165,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {

    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");

    const int batch_size = static_cast<int>(input->dims()[0]);

@@ -221,7 +220,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {

      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
      math::Vol2ColFunctor<DeviceContext, T> vol2col;
-      std::vector<int> dilations({1, 1, 1});

      if (input_grad) {
        input_grad->mutable_data<T>(context.GetPlace());

--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -62,7 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel {

 class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CosSimOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The 1st input of cos_sim op.");
    AddInput("Y", "The 2nd input of cos_sim op.");

--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -18,8 +18,7 @@ namespace paddle {
 namespace operators {
 class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CRFDecodingOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Emission",
             "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "

--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -52,7 +52,7 @@ class CropOp : public framework::OperatorWithKernel {

 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CropOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input of pad op. "

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -111,8 +111,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {

 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  CrossEntropyOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "

--- a/paddle/operators/decayed_adagrad_op.cc
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -55,8 +55,7 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {

 class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  DecayedAdagradOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("Grad", "(Tensor) Input gradient");

--- a/paddle/operators/detail/recv_impl.cc
+++ b/paddle/operators/detail/recv_impl.cc
@@ -20,25 +20,57 @@ namespace detail {

 Status SendRecvServerImpl::SendVariable(ServerContext *context,
                                        const VariableMessage *in_var,
-                                        VariableMessage *out_var) {
-  framework::LoDTensor t;
-  // TODO(typhoonzero): desirealize in_tensor and run pserver network.
+                                        VoidMessage *out_var) {
+  // TODO(typhoonzero): support different variable types.
  std::istringstream iss(in_var->serialized());
+  framework::LoDTensor t;
  framework::DeserializeFromStream(iss, &t);
-  lodtensor_queue_.Push(std::move(t));
-  // Block util the sub graph is done.
-  t = lodtensor_return_queue_.Pop();
+  TensorWithName tensor_with_name =
+      std::make_pair(in_var->varname(), std::move(t));
+
+  var_recv_queue_.Push(std::move(tensor_with_name));
+  return Status::OK;
+}
+
+Status SendRecvServerImpl::GetVariable(ServerContext *context,
+                                       const VariableMessage *in_var,
+                                       VariableMessage *out_var) {
+  std::string get_var_name = in_var->varname();
+  auto *var = scope_->FindVar(get_var_name);
+  auto tensor = var->Get<framework::LoDTensor>();
  std::ostringstream oss;
-  // FIXME(typhoonzero): get context from op.
-  framework::SerializeToStream(oss, t, platform::CPUDeviceContext());
+  framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext());
+
  std::string *varname = out_var->mutable_varname();
-  *varname = in_var->varname();
+  *varname = get_var_name;
  std::string *serialized = out_var->mutable_serialized();
  *serialized = oss.str();
+  return Status::OK;
+}

+Status SendRecvServerImpl::Wait(ServerContext *context,
+                                const VoidMessage *in_var,
+                                VoidMessage *out_var) {
+  {
+    std::unique_lock<std::mutex> lock(this->mutex_);
+    condition_.wait(lock, [=] { return this->done_ == true; });
+  }
  return Status::OK;
 }

+void SendRecvServerImpl::Reset() {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+  done_ = false;
+}
+
+void SendRecvServerImpl::Done() {
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_);
+    done_ = true;
+  }
+  condition_.notify_all();
+}
+
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/detail/send_impl.cc
+++ b/paddle/operators/detail/send_impl.cc
@@ -19,10 +19,10 @@ namespace operators {
 namespace detail {

 bool RPCClient::SendVariable(const framework::Scope& scope,
-                             const std::string& inname,
-                             const std::string& outname) {
+                             const std::string& inname) {
  ClientContext context;
-  VariableMessage msg, out_msg;
+  VariableMessage msg;
+  VoidMessage out_msg;
  // FIXME(typhoonzero): pass device context to here.
  auto ctx = platform::CPUDeviceContext();
  auto* var = scope.FindVar(inname);
@@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
  msg.set_serialized(oss.str());
  Status status = stub_->SendVariable(&context, msg, &out_msg);
  if (!status.ok()) {
+    LOG(ERROR) << "gRPC error: " << status.error_message();
    return false;
  }
-  std::istringstream iss(out_msg.serialized());
+  return true;
+}
+
+bool RPCClient::GetVariable(const framework::Scope& scope,
+                            const std::string& outname) {
+  ClientContext context;
+  VariableMessage call_msg, ret_msg;
+  call_msg.set_varname(outname);
+  auto ctx = platform::CPUDeviceContext();
+  Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
+  if (!status.ok()) {
+    LOG(ERROR) << "gRPC error: " << status.error_message();
+    return false;
+  }
+
+  std::istringstream iss(ret_msg.serialized());
+
  framework::LoDTensor ret_tensor;
  framework::DeserializeFromStream(iss, &ret_tensor);
  auto* outvar = scope.FindVar(outname);
@@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
  return true;
 }

+void RPCClient::Wait() {
+  ClientContext context;
+  VoidMessage call_msg, ret_msg;
+  stub_->Wait(&context, call_msg, &ret_msg);
+}
+
 }  // namespace detail
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -19,7 +19,12 @@ package sendrecv;
 service SendRecvService {
  // For parameter server round-robin like hashing, do not split tensors.
  // Send and recv only one tensor
-  rpc SendVariable(VariableMessage) returns (VariableMessage) {}
+  // TODO(typhoonzero): add streaming API
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // wait for one execution of the program
+  rpc Wait(VoidMessage) returns (VoidMessage) {}
 }

 // VariableMessage is serialized paddle variable message.

--- a/paddle/operators/detail/send_recv_impl.h
+++ b/paddle/operators/detail/send_recv_impl.h
@@ -20,10 +20,6 @@
 #include "paddle/framework/selected_rows.h"
 #include "paddle/operators/detail/simple_block_queue.h"

-// #include <grpc++/channel.h>
-// #include <grpc++/client_context.h>
-// #include <grpc++/create_channel.h>
-// #include <grpc++/security/credentials.h>
 #include "paddle/operators/detail/send_recv.grpc.pb.h"
 #include "paddle/operators/detail/send_recv.pb.h"

@@ -48,24 +44,32 @@ namespace paddle {
 namespace operators {
 namespace detail {

+typedef std::pair<std::string, framework::LoDTensor> TensorWithName;
+
 class SendRecvServerImpl final : public SendRecvService::Service {
 public:
  explicit SendRecvServerImpl() {}

  Status SendVariable(ServerContext *context, const VariableMessage *in_var,
+                      VoidMessage *out_var) override;
+  Status GetVariable(ServerContext *context, const VariableMessage *in_var,
                     VariableMessage *out_var) override;
+  Status Wait(ServerContext *context, const VoidMessage *in_var,
+              VoidMessage *out_var) override;
+  void Reset();
+  void Done();
+  void SetScope(framework::Scope *scope) { scope_ = scope; };

-  const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); }
-
-  void Push(const framework::LoDTensor &tensor) {
-    this->lodtensor_return_queue_.Push(tensor);
-  }
+  const TensorWithName Get() { return this->var_recv_queue_.Pop(); }

 private:
-  SimpleBlockQueue<framework::LoDTensor> lodtensor_queue_;
-  SimpleBlockQueue<framework::LoDTensor> lodtensor_return_queue_;
-  SimpleBlockQueue<framework::SelectedRows> selected_rows_queue_;
-  SimpleBlockQueue<framework::SelectedRows> selected_rows_return_queue_;
+  // received variable from RPC, operators fetch variable from this queue.
+  SimpleBlockQueue<TensorWithName> var_recv_queue_;
+  framework::Scope *scope_;
+  // condition of the sub program
+  std::mutex mutex_;
+  bool done_;
+  std::condition_variable condition_;
 };

 // RPCClient is a class to send tensors to pserver sub-network
@@ -75,8 +79,9 @@ class RPCClient {
  RPCClient(std::shared_ptr<Channel> channel)
      : stub_(SendRecvService::NewStub(channel)) {}

-  bool SendVariable(const framework::Scope &scope, const std::string &inname,
-                    const std::string &outname);
+  bool SendVariable(const framework::Scope &scope, const std::string &inname);
+  bool GetVariable(const framework::Scope &scope, const std::string &outname);
+  void Wait();

 private:
  std::unique_ptr<SendRecvService::Stub> stub_;

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -40,8 +40,7 @@ class DropoutOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  DropoutOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of dropout op.");
    AddOutput("Out", "The output of dropout op.");

--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -71,7 +71,7 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
      auto M = EigenMatrix<T>::Reshape(*mask, 1);
      Y.device(place) = X * M;
    } else {
-      Y.device(place) = X * dropout_prob;
+      Y.device(place) = X * (1.0f - dropout_prob);
    }
  }
 };

--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -57,7 +57,7 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
      auto Y = EigenMatrix<T>::Reshape(*y, 1);
      auto& place =
          *context.template device_context<DeviceContext>().eigen_device();
-      Y.device(place) = X * dropout_prob;
+      Y.device(place) = X * (1.0f - dropout_prob);
    }
  }
 };

--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseAddOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseAddOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Add", "$Out = X + Y$");
    AddComment(comment_);

--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseDivOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseDivOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Div", "$Out = X / Y$");
    AddComment(comment_);

--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -20,8 +20,7 @@ namespace operators {

 class ElementwiseMulOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseMulOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Mul", "$Out = X \\odot\\ Y$");
    AddComment(comment_);

--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -43,8 +43,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {

 class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ElementwiseOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The first input tensor of elementwise op");
    AddInput("Y", "(Tensor) The second input tensor of elementwise op");

--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -103,11 +103,13 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> {

  MidWiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
    ++j_;
-    i_ = j_ / post_;
-    if (UNLIKELY(i_ == n_)) {
+    if (UNLIKELY(j_ == post_)) {  // j_ ran through post_ steps: carry into i_
+      ++i_;
      j_ = 0;
+      if (UNLIKELY(i_ == n_)) {  // i_ ran through n_ steps: wrap to the start
        i_ = 0;
      }
+    }
    return *this;
  }

@@ -125,10 +127,10 @@ class MidWiseTransformIterator<T, platform::CPUDeviceContext> {

 private:
  const T* ptr_;
-  int i_;
+  int64_t i_;
  int64_t j_;
  int64_t n_;
-  int post_;
+  int64_t post_;
 };

 #ifdef __NVCC__

--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -19,8 +19,7 @@ namespace paddle {
 namespace operators {
 class ElementwiseSubOpMaker : public ElementwiseOpMaker {
 public:
-  ElementwiseSubOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : ElementwiseOpMaker(proto, op_checker) {
    SetComment("Sub", "$Out = X - Y$");
    AddComment(comment_);

--- a/paddle/operators/expand_op.cc
+++ b/paddle/operators/expand_op.cc
@@ -55,7 +55,7 @@ class ExpandOp : public framework::OperatorWithKernel {

 class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."

--- a/paddle/operators/feed_op.cc
+++ b/paddle/operators/feed_op.cc
@@ -54,8 +54,7 @@ class FeedOp : public framework::OperatorBase {

 class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FeedOpInfoMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of feed op");
    AddOutput("Out", "The output of feed op");

--- a/paddle/operators/fetch_op.cc
+++ b/paddle/operators/fetch_op.cc
@@ -61,8 +61,7 @@ class FetchOp : public framework::OperatorBase {

 class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FetchOpInfoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of fetch op");
    AddOutput("Out", "The output of fetch op");

--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -52,7 +52,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
 };
@@ -60,13 +60,12 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
 class FillConstantBatchSizeLikeOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto,
-                                   framework::OpAttrChecker *op_checker)
+  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddInput("Input",
             "(Tensor) Tensor "
             "whose dim_idx th dimension is used to specify the batch_size");

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -34,7 +34,8 @@ class FillConstantOp : public framework::OperatorBase {
  using framework::OperatorBase::OperatorBase;
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
-    auto data_type = static_cast<framework::DataType>(Attr<int>("dtype"));
+    auto data_type =
+        static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    auto value = Attr<float>("value");
    auto force_cpu = Attr<bool>("force_cpu");
    auto &out =
@@ -52,13 +53,12 @@ class FillConstantOp : public framework::OperatorBase {

 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FillConstantOpMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
    AddAttr<float>("value", "(float, default 0) The value to be filled")
        .SetDefault(0.0f);

--- a/paddle/operators/fill_op.cc
+++ b/paddle/operators/fill_op.cc
@@ -48,7 +48,7 @@ class FillOp : public framework::OperatorBase {
                                "Cannot find variable %s", Output("Out"))
                        .GetMutable<framework::LoDTensor>());
    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
-    auto dtype = static_cast<framework::DataType>(Attr<int>("dtype"));
+    auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
    platform::CPUPlace cpu;
    auto force_cpu = Attr<bool>("force_cpu");
    out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(),
@@ -76,7 +76,7 @@ class FillOp : public framework::OperatorBase {

 class FillOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  FillOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddComment(R"DOC(Fill operator

@@ -88,7 +88,7 @@ Fill an tensor with `value` and `shape`. The type of the tensor is specify by
        "value", "The float values of tensor, which are flatten in row major");
    AddAttr<std::vector<int>>("shape", "The shape of output tensor");
    AddAttr<int>("dtype", "The data type of output tensor, Default is float")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddAttr<bool>("force_cpu",
                  "Whether the output tensor must be at CPU memory or not. "
                  "Default is false.")

--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -24,20 +24,19 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of FillZerosLikeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Y"),
-                   "Output(Y) of FillZerosLikeOp should not be null.");
-    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Y");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FillZerosLikeOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
  }
 };

 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FillZerosLikeOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of fill-zeros-like op.");
-    AddOutput("Y", "The variable will be filled up with zeros.");
+    AddOutput("Out", "The variable will be filled up with zeros.");
    AddComment(R"DOC(
 FillZerosLike Operator.


--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -23,7 +23,7 @@ template <typename DeviceContext, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* out = context.Output<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());

    math::SetConstant<DeviceContext, T> setter;

--- a/paddle/operators/ftrl_op.cc
+++ b/paddle/operators/ftrl_op.cc
@@ -57,7 +57,7 @@ class FTRLOp : public framework::OperatorWithKernel {

 class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  FTRLOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -67,7 +67,7 @@ class GatherGradOp : public framework::OperatorWithKernel {

 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GatherOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The source input of gather op");
    AddInput("Index", "The index input of gather op");

--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -60,15 +60,14 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
 };

 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GaussianRandomOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "Output matrix of gaussian random op");

@@ -91,7 +90,7 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("dtype",
                 "(int, default 5(FP32)) "
                 "Output data type.")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);

    AddComment(R"DOC(
 GaussianRandom Operator.

--- a/paddle/operators/gru_op.cc
+++ b/paddle/operators/gru_op.cc
@@ -67,7 +67,7 @@ class GRUOp : public framework::OperatorWithKernel {

 class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(LoDTensor) The first input is a LodTensor, which supports "

--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {

 class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  GRUUnitOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "

--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
@@ -46,8 +46,7 @@ class HingeLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HingeLossOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Logits",
             "The input value (Logits) of Hinge loss op."

--- a/paddle/operators/huber_loss_op.cc
+++ b/paddle/operators/huber_loss_op.cc
@@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  HuberLossOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input value of huber loss op."

--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -70,8 +70,7 @@ class IncrementOp : public framework::OperatorBase {

 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  IncrementOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input tensor of increment operator");
    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
@@ -94,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("increment");
    grad_op->SetInput("X", Output("Out"));
    grad_op->SetOutput("Out", Input("X"));
    grad_op->SetAttr("step", -boost::get<float>(GetAttr("step")));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/is_empty_op.cc
+++ b/paddle/operators/is_empty_op.cc
@@ -47,8 +47,7 @@ class IsEmptyOp : public framework::OperatorBase {

 class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  IsEmptyOpProtoMaker(framework::OpProto *proto,
-                      framework::OpAttrChecker *op_checker)
+  IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");

--- a/paddle/operators/l1_norm_op.cc
+++ b/paddle/operators/l1_norm_op.cc
@@ -48,7 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel {

 class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  L1NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of l1_norm op.");
    AddOutput("Out", "(Scalar) The output of l1_norm op.");

--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -19,8 +19,7 @@ namespace operators {

 class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LinearChainCRFOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Emission",
             "(LoDTensor, default LoDTensor<float>) "

--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
@@ -58,8 +58,7 @@ class LoadOp : public framework::OperatorBase {

 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoadOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "(Tensor) The tensor need to be loaded");
    AddAttr<std::string>("file_path",

--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
@@ -38,8 +38,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {

 class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDArrayLengthProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensorArray) The input tensor array.");
    AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");

--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -30,13 +30,13 @@ class LoDRankTableOp : public framework::OperatorBase {
        scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
    out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
+    VLOG(10) << Input("X") << "'s lod information is " << *out;
  }
 };

 class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDRankTableOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor) input lod tensor, must contain lod information.");
@@ -63,11 +63,11 @@ class LoDRankTableInferShape : public framework::InferShapeBase {

 class LoDRankTableInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
    for (auto &o : op_desc.Output("Out")) {
      block->FindRecursiveOrCreateVar(o)->SetType(
-          framework::VarDesc::LOD_RANK_TABLE);
+          framework::proto::VarDesc::LOD_RANK_TABLE);
    }
  }
 };

--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/operators/lod_reset_op.cc
@@ -48,8 +48,7 @@ class LoDResetOp : public framework::OperatorWithKernel {

 class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDResetOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
    AddInput("TargetLoD",

--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -97,8 +97,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {

 class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LoDTensorToArrayOpProtoMaker(framework::OpProto *proto,
-                               framework::OpAttrChecker *op_checker)
+  LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "");
    AddInput("RankTable", "");
@@ -128,10 +127,10 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {

 class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+      block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
    }
  }
 };
@@ -141,14 +140,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("array_to_lod_tensor");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetInput("RankTable", Input("RankTable"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
@@ -46,8 +46,7 @@ class LogLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LogLossOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Predicted",
             "The input value (Predicted) of Log loss op."

--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 template <typename OpComment>
 class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  BinaryLogicalOpProtoMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    OpComment comment;
    AddInput("X",
@@ -45,8 +44,7 @@ Each element of Out is calculated by %s
 template <typename OpComment>
 class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  UnaryLogicalOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    OpComment comment;
    AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",

--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -51,8 +51,7 @@ class LookupTableOp : public framework::OperatorWithKernel {

 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LookupTableOpMaker(framework::OpProto* proto,
-                     framework::OpAttrChecker* op_checker)
+  LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("W",
             "An input represents embedding tensors, "
@@ -109,19 +108,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {

 class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind& op_desc,
-                  framework::BlockDescBind* block) const override {
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
    auto attr = op_desc.GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
              << " is set to SelectedRows";
-      block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS);
+      block->Var(out_var_name)
+          ->SetType(framework::proto::VarDesc::SELECTED_ROWS);
    } else {
      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
              << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR);
+      block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR);
    }
  }
 };

--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
@@ -140,7 +140,7 @@ class LRNOp : public framework::OperatorWithKernel {
 template <typename T>
 class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input of LRN operator. "

--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -102,7 +102,7 @@ class LSTMOp : public framework::OperatorWithKernel {

 class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input",
             "(LoDTensor) the first input is a LodTensor, which support "

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -48,10 +48,12 @@ class LstmUnitOp : public framework::OperatorWithKernel {

 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  LstmUnitOpMaker(framework::OpProto* proto,
-                  framework::OpAttrChecker* op_checker)
+  LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "FC input before the non-linear activation.");
+    AddInput("X",
+             "Lstm unit only applies non-linear activations, please make sure"
+             "that linear tranformation has already been applied to `X`. "
+             "Linear tranformation can be applied by adding a `fc` layer");
    AddInput(
        "C_prev",
        "The cell state tensor of last time-step in the Lstm Unit operator.");

--- a/paddle/operators/margin_rank_loss_op.cc
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel {
 template <typename T>
 class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MarginRankLossOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X1",
             "(2-D tensor with shape [batch_size x 1]) The score for "

--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -61,14 +61,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,

    const T* im_data = im.data<T>();
    T* col_data = col->data<T>();
-
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / filter_width / filter_height;
+      int c_im = c / (filter_width * filter_height);
      for (int h = 0; h < col_height; ++h) {
-        for (int w = 0; w < col_width; ++w) {
        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+        for (int w = 0; w < col_width; ++w) {
          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
          int col_idx = (c * col_height + h) * col_width + w;
          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
@@ -130,16 +129,14 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
-      int c_im = c / filter_width / filter_height;
+      int c_im = c / (filter_width * filter_height);
      for (int h = 0; h < col_height; ++h) {
-        for (int w = 0; w < col_width; ++w) {
        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+        for (int w = 0; w < col_width; ++w) {
          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
-
          if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
              (im_col_idx) >= 0 && (im_col_idx) < im_width) {
-            im_row_idx += c_im * im_height;
-            im_data[im_row_idx * im_width + im_col_idx] +=
+            im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] +=
                col_data[(c * col_height + h) * col_width + w];
          }
        }
@@ -199,12 +196,13 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
        for (int channel = 0; channel < im_channels; ++channel) {
          for (int filter_row_idx = 0; filter_row_idx < filter_height;
               ++filter_row_idx) {
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
            int im_row_offset =
                col_row_idx * stride[0] + filter_row_idx - padding[0];
+            for (int filter_col_idx = 0; filter_col_idx < filter_width;
+                 ++filter_col_idx) {
              int im_col_offset =
                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+
              int col_offset =
                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
                    channel) *
@@ -271,12 +269,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
        for (int channel = 0; channel < im_channels; ++channel) {
          for (int filter_row_idx = 0; filter_row_idx < filter_height;
               ++filter_row_idx) {
-            for (int filter_col_idx = 0; filter_col_idx < filter_width;
-                 ++filter_col_idx) {
            int im_row_offset =
                col_row_idx * stride[0] + filter_row_idx - padding[0];
+            for (int filter_col_idx = 0; filter_col_idx < filter_width;
+                 ++filter_col_idx) {
              int im_col_offset =
                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+
              int col_offset =
                  (((col_row_idx * col_width + col_col_idx) * im_channels +
                    channel) *
@@ -284,6 +283,7 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                   filter_row_idx) *
                      filter_width +
                  filter_col_idx;
+
              if (im_row_offset >= 0 && im_row_offset < im_height &&
                  im_col_offset >= 0 && im_col_offset < im_width) {
                int im_offset =

--- a/paddle/operators/math/math_function_impl.h
+++ b/paddle/operators/math/math_function_impl.h
@@ -67,18 +67,45 @@ void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
 template <typename DeviceContext, typename T>
 void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
                                              const framework::Tensor& input,
-                                              framework::Tensor* vector) {
+                                              framework::Tensor* out) {
  auto in_dims = input.dims();
  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  PADDLE_ENFORCE_EQ(out->numel(), size);

-  auto vec = framework::EigenMatrix<T>::From(*vector);
  auto in = framework::EigenMatrix<T>::From(input);
-  Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
-  vec.reshape(shape).device(*context.eigen_device()) =
-      in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
 }

+// Specialize for CPU: Eigen implements colwise-sum through its general
+// reduce, which has a huge overhead on CPU, whereas colwise-sum itself can be
+// implemented easily with a plain loop.
+template <typename T>
+class ColwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
+
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < height; ++i) {
+      for (size_t j = 0; j < size; ++j) {
+        if (i == 0) {
+          out_buf[j] = in_buf[i * size + j];
+        } else {
+          out_buf[j] += in_buf[i * size + j];
+        }
+      }
+    }
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/matmul_op.cc
+++ b/paddle/operators/matmul_op.cc
@@ -130,7 +130,7 @@ class MatMulOp : public framework::OperatorWithKernel {

 class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of MatMul op");
    AddInput("Y", "The second input of MatMul op");

--- a/paddle/operators/max_sequence_len_op.cc
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -40,8 +40,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase {

 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxSeqenceLenOpProtoMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("RankTable", "The lod_rank_table.");
    AddOutput("Out", "The max sequence length.");

--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
@@ -20,7 +20,7 @@ using framework::Tensor;

 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -32,7 +32,7 @@ class MeanOp : public framework::OperatorWithKernel {

 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op");
@@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
    grad_op->SetType("mean_grad");
    grad_op->SetInput("X", Input("X"));
    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -114,8 +114,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {

 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input LoDTensor, contains complete lod information to "
@@ -162,15 +161,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("split_lod_tensor");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetInput("Mask", Input("Mask"));
    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -46,7 +46,7 @@ class MinusOp : public framework::OperatorWithKernel {

 class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The left tensor of minus operator.");
    AddInput("Y", "The right tensor of minus operator.");
@@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
 public:
  using framework::GradOpDescMakerBase::GradOpDescMakerBase;

-  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
-      const override {
-    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<framework::OpDesc>> ops;
    auto x_g = InputGrad("X");
    if (!x_g.empty()) {
-      auto *x_g_op = new framework::OpDescBind();
+      auto *x_g_op = new framework::OpDesc();
      x_g_op->SetType("scale");
      x_g_op->SetInput("X", OutputGrad("Out"));
      x_g_op->SetOutput("Out", x_g);
@@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {

    auto y_g = InputGrad("Y");
    if (!y_g.empty()) {
-      auto *y_g_op = new framework::OpDescBind();
+      auto *y_g_op = new framework::OpDesc();
      y_g_op->SetType("scale");
      y_g_op->SetInput("X", OutputGrad("Out"));
      y_g_op->SetOutput("Out", y_g);

--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {

 class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ModifiedHuberLossOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input tensor of modified huber loss op. "

--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
@@ -54,8 +54,7 @@ class MomentumOp : public framework::OperatorWithKernel {

 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MomentumOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -71,41 +71,52 @@ class MulOpShapeInference : public framework::InferShapeBase {

 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of mul op");
-    AddInput("Y", "The second input of mul op");
-    AddOutput("Out", "The output of mul op");
+    AddInput("X", "(Tensor), The first input tensor of mul op.");
+    AddInput("Y", "(Tensor), The second input tensor of mul op.");
+    AddOutput("Out", "(Tensor), The output tensor of mul op.");
    AddAttr<int>(
        "x_num_col_dims",
-        "(int, default 1) "
-        R"DOC(mul_op can take tensors with more than two dimensions as input `X`,
-            in that case, tensors will be reshaped to a matrix. The matrix's first
-            dimension(column length) will be the product of tensor's last
-            `num_col_dims` dimensions, and the matrix's second dimension(row length)
-            will be the product of tensor's first `rank - num_col_dims` dimensions.
+        R"DOC((int, default 1), The mul_op can take tensors with more than two
+              dimensions as its inputs. If the input $X$ is a tensor with more
+              than two dimensions, $X$ will be flattened into a two-dimensional
+              matrix first. The flattening rule is: the first `x_num_col_dims`
+              dimensions are flattened to form the first dimension of the final
+              matrix (the height of the matrix), and the remaining
+              `rank(X) - x_num_col_dims` dimensions are flattened to form the second
+              dimension of the final matrix (the width of the matrix). As a result,
+              the height of the flattened matrix is the product of the sizes of
+              $X$'s first `x_num_col_dims` dimensions, and the width is the product
+              of the sizes of $X$'s last `rank(X) - x_num_col_dims` dimensions.
+              For example, suppose $X$ is a 5-dimensional
+              tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3.
+              Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] =
+              [24, 30].
        )DOC")
        .SetDefault(1)
        .EqualGreaterThan(1);
    AddAttr<int>(
        "y_num_col_dims",
-        "(int, default 1) "
-        R"DOC(mul_op can take tensors with more than two dimensions as input `Y`,
-             in that case, tensors will be reshaped to a matrix. Just like input `X`.
+        R"DOC((int, default 1), The mul_op can take tensors with more than two
+              dimensions as its inputs. If the input $Y$ is a tensor with more
+              than two dimensions, $Y$ will be flattened into a two-dimensional
+              matrix first. The attribute `y_num_col_dims` determines how $Y$ is
+              flattened. See comments of `x_num_col_dims` for more details.
        )DOC")
        .SetDefault(1)
        .EqualGreaterThan(1);
    AddComment(R"DOC(
 Mul Operator.

-This operator is used to perform matrix multiplication for input X and Y.
+This operator is used to perform matrix multiplication for input $X$ and $Y$.

 The equation is:

    $$Out = X * Y$$

-Both the input `X` and `Y` can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input `X`.
+Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input $X$.

 )DOC");
  }

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -61,8 +61,7 @@ class MultiplexOp : public framework::OperatorWithKernel {

 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MultiplexOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Ids", "The index tensor of multiplex operator.");
    AddInput("X", "The candidate tensors of multiplex operator.")

--- a/paddle/operators/nccl_op.cc
+++ b/paddle/operators/nccl_op.cc
@@ -43,8 +43,7 @@ class NCCLInitOp : public framework::OperatorBase {

 class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLInitOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Communicator",
              "Create Communicator for communicating between gpus");
@@ -52,7 +51,7 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddComment(R"DOC(
 NCCLInit Operator.

@@ -141,8 +140,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
 // AllreduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLAllReduceOpMaker(framework::OpProto *proto,
-                       framework::OpAttrChecker *op_checker)
+  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of AllReduce op");
    AddInput("Communicator", "Communicator for communicating between gpus");
@@ -163,8 +161,7 @@ AllReduce the input tensors.
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLReduceOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of Reduce op");
    AddInput("Communicator", "Communicator for communicating between gpus");
@@ -190,8 +187,7 @@ Reduce the tensors.
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCCLBcastOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of BcastSend op");
    AddInput("Communicator", "Communicator for communicating between gpus");

--- a/paddle/operators/nccl_op_test.cu.cc
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -65,7 +65,7 @@ class NCCLTester : public ::testing::Test {
  }

  void NCCLInitOp() {
-    std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind);
+    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);

    op1->SetType("ncclInit");
    op1->SetOutput("Communicator", {"comm"});
@@ -81,10 +81,9 @@ class NCCLTester : public ::testing::Test {
  }

  template <class T>
-  void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc,
-                        f::Scope *scope) {
+  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
    std::unique_lock<std::mutex> lk(mu);
-    const f::OpDescBind *op1 = &op_desc;
+    const f::OpDesc *op1 = &op_desc;

    p::GPUPlace place(gpu_id);
    auto &ctx = dev_ctxs.at(gpu_id);
@@ -125,7 +124,7 @@ class NCCLTester : public ::testing::Test {

 // ncclInitOp with desc
 TEST(NCCL, ncclInitOp) {
-  std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);

  op_desc->SetType("ncclInit");
  op_desc->SetOutput("Communicator", {"x1"});
@@ -145,7 +144,7 @@ TEST(NCCL, ncclInitOp) {

 // ncclAllReduceOp with desc
 TEST_F(NCCLTester, ncclAllReduceOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  op2->SetType("ncclAllReduce");
  op2->SetInput("X", {"st"});
  op2->SetInput("Communicator", {"comm"});
@@ -192,7 +191,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {

 // ncclReduceOp with desc
 TEST_F(NCCLTester, ncclReduceOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 0;
  op2->SetType("ncclReduce");
  op2->SetInput("X", {"st"});
@@ -240,7 +239,7 @@ TEST_F(NCCLTester, ncclReduceOp) {

 // ncclBcastOp with desc
 TEST_F(NCCLTester, ncclBcastOp) {
-  std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind);
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
  const int kRoot = 5;
  op2->SetType("ncclBcast");
  op2->SetInput("X", {"st"});

--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
@@ -73,7 +73,7 @@ class NCEOp : public framework::OperatorWithKernel {

 class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
    AddInput(

--- a/paddle/operators/batch_norm_op.md
+++ b/paddle/operators/batch_norm_op.md
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -35,8 +35,8 @@ Here we give some examples to show how these rules will be used.
 ```c++
 class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AccumulateOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
    If the output size is not the same as input size,

--- a/paddle/operators/net_op_design.md
+++ b/paddle/operators/net_op_design.md
--- a/paddle/operators/op_documentation/op_markdown_format.md
+++ b/paddle/operators/op_documentation/op_markdown_format.md
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+# PaddlePaddle Operator Name
+This should be in all lowercase letters. If the name has multiple words, separate them with underscores. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+# Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that combine multiple words into a single word use `camelCase` (capitalize each word boundary inside the name).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+# Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+# LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Multiple words within a single variable name should be separated by an underscore (`_`).
+
+# The signature
+This section describes the signature of the operator: a list of Inputs and Outputs, each of which has a short description of what the variable represents and the type of the variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+
+```
+Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+```
+
+
+The following example for an `sgd` operator covers the above-mentioned sections as they would ideally look in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param - learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
--- a/paddle/operators/rnn_design.md
+++ b/paddle/operators/rnn_design.md
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -48,7 +48,7 @@ class PadOp : public framework::OperatorWithKernel {

 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PadOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input of pad op. "
@@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* bind = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* bind = new framework::OpDesc();
    bind->SetInput("X", Input("X"));
    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    bind->SetAttrMap(Attrs());
    bind->SetType("pad_grad");
-    return std::unique_ptr<framework::OpDescBind>(bind);
+    return std::unique_ptr<framework::OpDesc>(bind);
  }
 };


--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -67,8 +67,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
 }

-Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput(
      "X",
@@ -136,8 +135,7 @@ Example:
 )DOC");
 }

-Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
    : OpProtoAndCheckerMaker(proto, op_checker) {
  AddInput("X",
           "(Tensor) The input tensor of pooling operator. "

--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -40,14 +40,12 @@ class PoolOpGrad : public framework::OperatorWithKernel {

 class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Pool2dOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Pool3dOpMaker(framework::OpProto* proto,
-                framework::OpAttrChecker* op_checker);
+  Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
 };

 template <typename DeviceContext, typename T>

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {

 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxPool2dWithIndexOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",
@@ -178,8 +177,7 @@ Example:

 class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
-                            framework::OpAttrChecker *op_checker)
+  MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor of pooling operator. "

--- a/paddle/operators/positive_negative_pair_op.cc
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {

 class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PositiveNegativePairOpMaker(framework::OpProto *proto,
-                              framework::OpAttrChecker *op_checker)
+  PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Score",
             "(Tensor, float) Model Score on an item (with "

--- a/paddle/operators/precision_recall_op.cc
+++ b/paddle/operators/precision_recall_op.cc
@@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {

 class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PrecisionRecallOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("MaxProbs",
             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "

--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -38,7 +38,7 @@ class PReluOp : public framework::OperatorWithKernel {

 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of prelu operator.");
    AddInput("Alpha", "The alpha weight of prelu operator.");

--- a/paddle/operators/proximal_adagrad_op.cc
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -59,8 +59,7 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {

 class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ProximalAdagradOpMaker(framework::OpProto *proto,
-                         framework::OpAttrChecker *op_checker)
+  ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/proximal_gd_op.cc
+++ b/paddle/operators/proximal_gd_op.cc
@@ -47,8 +47,7 @@ class ProximalGDOp : public framework::OperatorWithKernel {

 class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ProximalGDOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -45,8 +45,7 @@ class RankLossOp : public framework::OperatorWithKernel {

 class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RankLossOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
+  RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Label",
             "(2-D Tensor with shape [batch_size x 1]) "

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -234,7 +234,7 @@ class RecurrentOp : public RecurrentBase {
    auto reverse = Attr<bool>(kReverse);

    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();

    for (size_t i = 0; i < seq_len; ++i) {
@@ -317,7 +317,7 @@ class RecurrentGradOp : public RecurrentBase {
    auto reverse = Attr<bool>(kReverse);

    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();

    for (size_t step_id = 0; step_id < seq_len; ++step_id) {
@@ -497,8 +497,7 @@ class RecurrentGradOp : public RecurrentBase {

 class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RecurrentOpProtoMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
+  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kInputs, "rnn inputs").AsDuplicable();
    AddInput(kInitialStates, "rnn initial states").AsDuplicable();
@@ -523,8 +522,7 @@ The ex-state means the state value in the ex-timestep or the previous time step
        string::Sprintf(
            "The state variable names. [%s, %s, %s] must be the same order",
            kExStates, kStates, kInitStateGrads));
-    AddAttr<framework::BlockDescBind *>(kStepBlock,
-                                        "The step block inside RNN");
+    AddAttr<framework::BlockDesc *>(kStepBlock, "The step block inside RNN");
    AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not.
 By default reverse=False

@@ -566,13 +564,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  virtual std::unique_ptr<framework::OpDescBind> Apply() const {
-    auto *grad = new framework::OpDescBind();
+  virtual std::unique_ptr<framework::OpDesc> Apply() const {
+    auto *grad = new framework::OpDesc();
    grad->SetType("recurrent_grad");
    for (auto &input_param : this->InputNames()) {
      grad->SetInput(input_param, this->Input(input_param));
      grad->SetOutput(framework::GradVarName(input_param),
-                      this->InputGrad(input_param));
+                      this->InputGrad(input_param, false));
    }

    for (auto &output_param : this->OutputNames()) {
@@ -589,7 +587,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
    grad->SetAttrMap(this->Attrs());
    grad->SetBlockAttr(kStepBlock, *grad_block_[0]);

-    return std::unique_ptr<framework::OpDescBind>(grad);
+    return std::unique_ptr<framework::OpDesc>(grad);
  }
 };


--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -24,6 +24,7 @@
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/proto_desc.h"
 #include "paddle/operators/detail/send_recv_impl.h"
 #include "paddle/operators/detail/simple_block_queue.h"

@@ -61,29 +62,76 @@ class RecvOp : public framework::OperatorBase {
    server_thread_->join();
  }

+  std::string GetGradVarNameForTrainer(const std::string &varname) const {
+    if (grads_counter_.find(varname) == grads_counter_.end()) {
+      grads_counter_[varname] = 0;
+    }
+    char ret[256];
+    snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(),
+             grads_counter_[varname]++);
+    return std::string(ret);
+  }
+
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
-    // blocking get one var from client.
-    const framework::LoDTensor &t = rpc_service_->Get();
+    // FIXME(typhoonzero): no new scopes for every run.
    framework::Scope &recv_scope = scope.NewScope();
-    // set graph input var
-    auto *var = recv_scope.Var(Input("RX"));
+    rpc_service_->SetScope(&recv_scope);
+    auto param_list = Attr<std::vector<std::string>>("ParamList");
+    auto grad_list = Attr<std::vector<std::string>>("GradList");
+    auto trainer_count = Attr<int>("Trainers");
+    size_t param_count = param_list.size();
+    rpc_service_->Reset();
+    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
+    while (true) {
+      // Get from multiple trainers, we don't care about order in which
+      // the gradient arrives, just add suffix 0~n then average the gradient.
+      for (size_t i = 0; i < param_count * trainer_count; ++i) {
+        // blocking get one var from client.
+        const detail::TensorWithName &v = rpc_service_->Get();
+        auto grad_var_name = v.first;
+        auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+        std::string param_var_name;
+        if (it != grad_list.end()) {
+          param_var_name = param_list[it - grad_list.begin()];
+        } else {
+          LOG(ERROR) << "grad have no paired param found!";
+        }
+        VLOG(3) << "recved grad: " << grad_var_name
+                << " updating param: " << param_var_name;
+        auto *merged_grad = recv_scope.FindVar(grad_var_name);
+        if (merged_grad == nullptr) {
+          // create output of merged var.
+          auto merged_var = recv_scope.Var(grad_var_name);
+          merged_var->GetMutable<framework::LoDTensor>();
+        }
+
+        if (trainer_count > 1) {
+          grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
+        }
+
+        auto *var = recv_scope.Var(grad_var_name);
        auto *tensor = var->GetMutable<framework::LoDTensor>();
        // FIXME(typhoonzero): do not copy
-    framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor);
+        framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor);
+      }
+      rpc_service_->Reset();

      std::string program_str = Attr<std::string>("OptimizeProgram");
-    framework::ProgramDesc program_desc;
+      framework::proto::ProgramDesc program_desc;
      program_desc.ParseFromString(program_str);
-    framework::ProgramDescBind program(program_desc);
+      framework::ProgramDesc program(program_desc);
      framework::Executor executor(dev_ctx);
      // Run sub graph to get optimized tensor
+      try {
        executor.Run(program, &recv_scope, 0, /*global_block*/
-                 false /*create_local_scope*/);
-
-    auto *out_var = recv_scope.FindVar("Out");
-    // push back
-    rpc_service_->Push(out_var->Get<framework::LoDTensor>());
+                     false /*create_local_scope*/, false /*create_vars*/);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+      rpc_service_->Done();
+      grads_counter_.clear();
+    }  // while(true)
  }

 protected:
@@ -93,13 +141,14 @@ class RecvOp : public framework::OperatorBase {
  // grpc send/recv service implement to register.
  std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
  std::shared_ptr<std::thread> server_thread_;
+  mutable std::unordered_map<std::string, int> grads_counter_;
 };

 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("RX", "(Tensor) Input tensor to be saved");
+    AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable();
    AddComment(R"DOC(
 Recv operator

@@ -112,6 +161,17 @@ This operator will recv tensor from send_op
        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
    AddAttr<std::string>("OptimizeProgram", "type string",
                         "Serialized ProgramDesc string for recv to run.");
+    AddAttr<std::vector<std::string>>(
+        "ParamList", "type list of string",
+        "grad->param name mapping to find which param to optimize.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "GradList", "type list of string",
+        "grad->param name mapping to find which param to optimize.")
+        .SetDefault({});
+    AddAttr<int>("Trainers", "type int",
+                 "Number of trainers in the current cluster job")
+        .SetDefault(1);
  }
 };


--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -83,7 +83,7 @@ class ReduceGradOp : public framework::OperatorWithKernel {

 class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor) The input tensor. Tensors with rank at most 6 are "
@@ -135,8 +135,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.

 class ReduceSumOpMaker : public ReduceOpMaker {
 public:
-  ReduceSumOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceSum", "sum");
    AddComment(comment_);
@@ -145,8 +144,7 @@ class ReduceSumOpMaker : public ReduceOpMaker {

 class ReduceMeanOpMaker : public ReduceOpMaker {
 public:
-  ReduceMeanOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
+  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceMean", "mean");
    AddComment(comment_);
@@ -155,8 +153,7 @@ class ReduceMeanOpMaker : public ReduceOpMaker {

 class ReduceMaxOpMaker : public ReduceOpMaker {
 public:
-  ReduceMaxOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceMax", "max");
    AddComment(comment_);
@@ -165,8 +162,7 @@ class ReduceMaxOpMaker : public ReduceOpMaker {

 class ReduceMinOpMaker : public ReduceOpMaker {
 public:
-  ReduceMinOpMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : ReduceOpMaker(proto, op_checker) {
    SetComment("ReduceMin", "min");
    AddComment(comment_);

--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -77,8 +77,7 @@ class ReshapeOp : public framework::OperatorWithKernel {

 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReshapeOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input tensor of reshape operator.");
    AddOutput("Out", "The output tensor of reshape operator.");

--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel {

 class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RmspropOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param",
             "(Tensor, default Tensor<float>) "

--- a/paddle/operators/rnn_memory_helper_op.cc
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -57,15 +57,14 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {

 class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RNNMemoryHelperOpInfoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "");
    AddOutput("Out", "");
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddComment("");
  }
 };
@@ -114,8 +113,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 class RNNMemoryHelperGradOpInfoMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  RNNMemoryHelperGradOpInfoMaker(framework::OpProto *proto,
-                                 framework::OpAttrChecker *op_checker)
+  RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(framework::GradVarName("Out"), "");
    AddInput("X", "");
@@ -124,7 +122,7 @@ class RNNMemoryHelperGradOpInfoMaker
    AddAttr<int>("dtype",
                 "(int, default 5 (FP32)) "
                 "Output data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
    AddComment("");
  }
 };

--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
@@ -99,8 +99,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {

 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ROIPoolOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor), "

--- a/paddle/operators/row_conv_op.cc
+++ b/paddle/operators/row_conv_op.cc
@@ -76,8 +76,7 @@ class RowConvGradOp : public framework::OperatorWithKernel {

 class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RowConvOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor), the input(X) is a LodTensor, which supports "

--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
@@ -94,8 +94,7 @@ class SaveOp : public framework::OperatorBase {

 class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SaveOpProtoMaker(framework::OpProto *proto,
-                   framework::OpAttrChecker *op_checker)
+  SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor ) Input tensor to be saved");
    AddComment(R"DOC(

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -38,7 +38,7 @@ class ScaleOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor of scale operator.");
    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
@@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("scale");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttr("scale", GetAttr("scale"));
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -78,8 +78,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {

 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ScatterOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Ref", "The source input of scatter op");
    AddInput("Index",

--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -34,45 +34,56 @@ class SendOp : public framework::OperatorBase {
         const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {
    // init client when the operator is created at runtime.
-    if (!client_) {
-      std::string endpoint = Attr<std::string>("endpoint");
-      client_.reset(new detail::RPCClient(
-          grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials())));
-      // TODO(typhoonzero): how to call InitVariables
+    std::vector<std::string> endpoints =
+        Attr<std::vector<std::string>>("endpoints");
+    for (auto ep : endpoints) {
+      client_map_[ep].reset(new detail::RPCClient(
+          grpc::CreateChannel(ep, grpc::InsecureChannelCredentials())));
    }
  }
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
-    auto iname = Input("X");
-    auto oname = Output("Out");
-    // TODO(typhoonzero): currently it's non-blocking,
-    // should block until server responds.
-    bool ret = client_->SendVariable(scope, iname, oname);
+    auto ins = Inputs("X");
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    // TODO(typhoonzero): use async calls to send multiple variable asyncly.
+    for (size_t i = 0; i < ins.size(); ++i) {
+      bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]);
      if (!ret) {
-      LOG(ERROR) << "send variable error";
+        LOG(ERROR) << "send variable error: " << ins[i];
+      }
+    }
+    // TODO(typhoonzero): support async optimization
+    client_map_[epmap[0]]->Wait();
+    for (size_t i = 0; i < ins.size(); ++i) {
+      bool ret = client_map_[epmap[i]]->GetVariable(scope, ins[i]);
+      if (!ret) {
+        LOG(ERROR) << "GetVariable error: " << ins[i];
+      }
    }
  }

 protected:
-  std::shared_ptr<detail::RPCClient> client_{nullptr};
+  mutable std::unordered_map<std::string, std::shared_ptr<detail::RPCClient>>
+      client_map_;
 };

 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) Input tensor to be saved");
-    AddOutput("Out", "(Tensor) Output fetched from server");
+    AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable();
    AddComment(R"DOC(
 Recv operator

 This operator will recv tensor from send_op
 )DOC");
-    AddAttr<std::string>("endpoint",
-                         "(string, default 127.0.0.1:6164)"
-                         "IP address to listen on.")
-        .SetDefault("127.0.0.1:6164")
-        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.");
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints in the order of input "
+                                      "variables for mapping");
  }
 };


--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -16,12 +16,14 @@
 // a RemoteOptimizer.

 #include <unistd.h>
+#include <string>
 #include <thread>

 #include "gtest/gtest.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
+#include "paddle/string/printf.h"

 USE_NO_KERNEL_OP(send);
 USE_NO_KERNEL_OP(recv);
@@ -33,30 +35,33 @@ std::unique_ptr<paddle::framework::OperatorBase> recv_op;
 void InitTensorsInScope(paddle::framework::Scope &scope,
                        paddle::platform::CPUPlace &place) {
  paddle::platform::CPUDeviceContext ctx(place);
-  auto var = scope.Var("X");
+  for (int i = 0; i < 2; ++i) {
+    auto var_name = paddle::string::Sprintf("x%d", i);
+    auto var = scope.Var(var_name);
    auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
    tensor->Resize({10, 10});
    float *expect = tensor->mutable_data<float>(place);
    for (int64_t i = 0; i < tensor->numel(); ++i) {
      expect[i] = static_cast<float>(i);
    }
+  }

  auto out_var = scope.Var("Out");
  auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
  out_tensor->Resize({10, 10});
-  tensor->mutable_data<float>(place);  // allocate
+  out_tensor->mutable_data<float>(place);  // allocate
 }

 void AddOp(const std::string &type,
           const paddle::framework::VariableNameMap &inputs,
           const paddle::framework::VariableNameMap &outputs,
           paddle::framework::AttributeMap attrs,
-           paddle::framework::BlockDescBind *block) {
+           paddle::framework::BlockDesc *block) {
  // insert output
  for (auto kv : outputs) {
    for (auto v : kv.second) {
      auto var = block->Var(v);
-      var->SetDataType(paddle::framework::DataType::FP32);
+      var->SetDataType(paddle::framework::proto::DataType::FP32);
    }
  }

@@ -78,10 +83,10 @@ void StartServerNet() {
  InitTensorsInScope(scope, place);

  // sub program run in recv_op, for simple test we use sum
-  paddle::framework::ProgramDescBind program;
-  paddle::framework::BlockDescBind *block = program.MutableBlock(0);
+  paddle::framework::ProgramDesc program;
+  paddle::framework::BlockDesc *block = program.MutableBlock(0);
  // X for server side tensors, RX for received tensers, must be of same shape.
-  AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block);
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);

  paddle::framework::AttributeMap attrs;
  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
@@ -89,8 +94,8 @@ void StartServerNet() {
  PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));

  attrs.insert({"OptimizeProgram", program_proto});
-  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}},
-                                                    {{"Out", {"Out"}}}, attrs);
+  recv_op = paddle::framework::OpRegistry::CreateOp(
+      "recv", {{"RX", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs);
  paddle::platform::CPUDeviceContext ctx(place);
  recv_op->Run(scope, ctx);
 }
@@ -107,11 +112,11 @@ TEST(SendRecvOp, CPU) {
  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});

  auto send_op = paddle::framework::OpRegistry::CreateOp(
-      "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
+      "send", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs);
  paddle::platform::CPUDeviceContext ctx(place);
  send_op->Run(scope, ctx);

-  auto in_var = scope.Var("X");
+  auto in_var = scope.Var("x0");
  auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
  float *expected = tensor->data<float>();


--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel {

 class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceConcatOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LodTensorArray) Input is a vector of LoDTensor, "
@@ -125,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker,
-            sequence_concat_grad, ops::SequenceConcatGradOp);
+REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
+               ops::SequenceConcatOpMaker, sequence_concat_grad,
+               ops::SequenceConcatGradOp, false);
 REGISTER_OP_CPU_KERNEL(
    sequence_concat,
    ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);

--- a/paddle/operators/sequence_conv_op.cc
+++ b/paddle/operators/sequence_conv_op.cc
@@ -100,8 +100,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {

 class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceConvOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/sequence_expand_op.cc
+++ b/paddle/operators/sequence_expand_op.cc
@@ -37,8 +37,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel {

 class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceExpandOpMaker(framework::OpProto* proto,
-                        framework::OpAttrChecker* op_checker)
+  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor or LoDTensor) The input(X) of this operator can be a "

--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -37,8 +37,7 @@ class SequencePoolOp : public framework::OperatorWithKernel {

 class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequencePoolOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
    AddOutput("Out",

--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
@@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {

 class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceSliceOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor), "

--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -33,8 +33,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {

 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SequenceSoftmaxOpMaker(framework::OpProto* proto,
-                         framework::OpAttrChecker* op_checker)
+  SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
@@ -51,10 +50,14 @@ input Tensor can be either [N, 1] or [N], where N is the sum of the length
 of all sequences.

 The algorithm works as follows:
+
    for i-th sequence in a mini-batch:
-        $$Out(X[lod[i]:lod[i+1]], :) =
-            \frac{\exp(X[lod[i]:lod[i+1], :])}
-            {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$
+
+$$
+Out(X[lod[i]:lod[i+1]], :) = \
+\frac{\exp(X[lod[i]:lod[i+1], :])} \
+{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+$$

 For example, for a mini-batch of 3 sequences with variable-length,
 each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],

--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -43,7 +43,7 @@ class SGDOp : public framework::OperatorWithKernel {

 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SGDOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Param", "(Tensor) Input parameter");
    AddInput("LearningRate", "(Tensor) Learning rate of SGD");

--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -54,8 +54,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {

 class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto,
-                              framework::OpAttrChecker *op_checker)
+  ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
@@ -137,14 +136,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
    op->SetType("shrink_rnn_memory_grad");
    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(op);
+    return std::unique_ptr<framework::OpDesc>(op);
  }
 };


--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -86,8 +86,8 @@ class SigmoidCrossEntropyWithLogitsGradOp
 class SigmoidCrossEntropyWithLogitsOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto,
-                                       framework::OpAttrChecker* op_checker)
+  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
+                                       OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "

--- a/paddle/operators/sign_op.cc
+++ b/paddle/operators/sign_op.cc
@@ -34,7 +34,7 @@ class SignOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SignOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor of sign operator.");
    AddOutput("Out", "(Tensor) Output tensor of sign operator.");
@@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("scale");
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttr("scale", 0.0f);
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -47,8 +47,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SmoothL1LossOpMaker(framework::OpProto* proto,
-                      framework::OpAttrChecker* op_checker)
+  SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -36,8 +36,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {

 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftmaxOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
+  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
             "The input tensor of softmax. "

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 class SoftmaxWithCrossEntropyOpMaker
    : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto,
-                                 framework::OpAttrChecker* op_checker)
+  SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Logits",
             "(Tensor, default: Tensor<float>), The unscaled log probabilities "
@@ -174,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto* grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
    grad_op->SetType("softmax_with_cross_entropy_grad");
    grad_op->SetInput("Label", Input("Label"));
    grad_op->SetInput("Softmax", Output("Softmax"));
@@ -184,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
    grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -118,8 +118,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {

 class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
-                             framework::OpAttrChecker *op_checker)
+  SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input LoDTensor");
    AddInput("Mask", "A bool column vector which mask the input");
@@ -164,8 +163,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("merge_lod_tensor");
    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
@@ -173,7 +172,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
    grad_op->SetInput("X", Input("X"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -65,7 +65,7 @@ class SplitOp : public framework::OperatorWithKernel {

 class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SplitOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input tensor of the split operator.");
    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
@@ -108,13 +108,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
    op->SetType("concat");
    op->SetInput("X", OutputGrad("Out"));
    op->SetOutput("Out", InputGrad("X"));
    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(op);
+    return std::unique_ptr<framework::OpDesc>(op);
  }
 };


--- a/paddle/operators/spp_op.cc
+++ b/paddle/operators/spp_op.cc
@@ -18,7 +18,7 @@ namespace operators {

 class SppOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SppOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SppOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {

 class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SquaredL2DistanceOpMaker(framework::OpProto* proto,
-                           framework::OpAttrChecker* op_checker)
+  SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
    AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");

--- a/paddle/operators/squared_l2_norm_op.cc
+++ b/paddle/operators/squared_l2_norm_op.cc
@@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel {

 class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SquaredL2NormOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of squared_l2_norm op.");
    AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");

--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) {
  platform::GPUPlace gpu0(0);
  platform::CPUPlace cpu;

+  platform::CUDADeviceContext ctx(gpu0);
+
  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());

  framework::DDim src_stride({5, 1});

@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
  framework::DDim dst_dim({2, 2});
  framework::DDim dst_stride({2, 1});

-  platform::CUDADeviceContext ctx(gpu0);
  StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
                     gpu_dst);

@@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) {

  platform::GPUPlace gpu0(0);
  platform::CPUPlace cpu;
+  platform::CUDADeviceContext ctx(gpu0);

  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
-  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());

  int dst[8];
  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
  framework::DDim src_stride({2, 1});
  framework::DDim dst_dim({2, 2});
  framework::DDim dst_stride({4, 1});
-  platform::CUDADeviceContext ctx(gpu0);

  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -29,7 +29,7 @@ class SumOp : public framework::OperatorWithKernel {
                   "Output(Out) of SumOp should not be null.");
    if (ctx->IsRuntime() &&
        ctx->GetOutputsVarType("Out")[0] ==
-            framework::VarDesc::LOD_TENSOR_ARRAY) {
+            framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
      return;  // skip runtime infershape when is tensor array;
    }

@@ -72,8 +72,8 @@ class SumOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_NE(dtype, -1,
                        "Sum operator should have at least one tensor");

-      return framework::OpKernelType(static_cast<framework::DataType>(dtype),
-                                     ctx.device_context());
+      return framework::OpKernelType(
+          static_cast<framework::proto::DataType>(dtype), ctx.device_context());
    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
      return framework::OpKernelType(
          framework::ToDataType(
@@ -98,7 +98,7 @@ class SumOp : public framework::OperatorWithKernel {

 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+  SumOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
        .AsDuplicable();
@@ -115,10 +115,10 @@ the LoD information with the first input.

 class SumOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind& op_desc,
-                  framework::BlockDescBind* block) const override {
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
    auto& inputs = op_desc.Input("X");
-    auto var_type = framework::VarDesc::SELECTED_ROWS;
+    auto var_type = framework::proto::VarDesc::SELECTED_ROWS;

    for (auto& name : op_desc.Input("X")) {
      VLOG(10) << name << " "
@@ -128,12 +128,12 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
    bool any_input_is_lod_tensor = std::any_of(
        inputs.begin(), inputs.end(), [block](const std::string& name) {
          return block->FindRecursiveOrCreateVar(name)->GetType() ==
-                 framework::VarDesc::LOD_TENSOR;
+                 framework::proto::VarDesc::LOD_TENSOR;
        });

    auto is_tensor_array = [block](const std::string& name) {
      return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
-             framework::VarDesc::LOD_TENSOR_ARRAY;
+             framework::proto::VarDesc::LOD_TENSOR_ARRAY;
    };

    bool any_input_is_tensor_array =
@@ -152,9 +152,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
        PADDLE_ENFORCE(all_inputs_are_tensor_array,
                       "Not all inputs are tensor array:\n%s", os.str());
      }
-      var_type = framework::VarDesc::LOD_TENSOR_ARRAY;
+      var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY;
    } else if (any_input_is_lod_tensor) {
-      var_type = framework::VarDesc::LOD_TENSOR;
+      var_type = framework::proto::VarDesc::LOD_TENSOR;
    }

    auto out_var_name = op_desc.Output("Out").front();
@@ -169,20 +169,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
 public:
  using framework::GradOpDescMakerBase::GradOpDescMakerBase;

-  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
-      const override {
-    auto x_grads = InputGrad("X");
-    std::vector<std::unique_ptr<framework::OpDescBind>> grad_ops;
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    auto x_grads = InputGrad("X", false);
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_ops;
    grad_ops.reserve(x_grads.size());
    auto og = OutputGrad("Out");
    std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops),
                   [&og](const std::string& x_grad) {
-                     auto* grad_op = new framework::OpDescBind();
+                     auto* grad_op = new framework::OpDesc();
                     grad_op->SetType("scale");
                     grad_op->SetInput("X", og);
                     grad_op->SetOutput("Out", {x_grad});
                     grad_op->SetAttr("scale", 1.0f);
-                     return std::unique_ptr<framework::OpDescBind>(grad_op);
+                     return std::unique_ptr<framework::OpDesc>(grad_op);
                   });
    return grad_ops;
  }

--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -51,8 +51,7 @@ class WriteToArrayOp : public ArrayOp {

 class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  WriteToArrayOpProtoMaker(framework::OpProto *proto,
-                           framework::OpAttrChecker *op_checker)
+  WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
    AddInput(
@@ -97,14 +96,14 @@ class WriteToArrayInferShape : public framework::InferShapeBase {

 class WriteToArrayInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
    auto x_name = op_desc.Input("X")[0];
    auto out_name = op_desc.Output("Out")[0];
    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
    auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
                            "Cannot found %s", out_name);
-    out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+    out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
    auto *x = block->FindVarRecursive(x_name);
    if (x != nullptr) {
      out.SetDataType(x->GetDataType());
@@ -140,8 +139,7 @@ class ReadFromArrayOp : public ArrayOp {

 class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  ReadFromArrayProtoMaker(framework::OpProto *proto,
-                          framework::OpAttrChecker *op_checker)
+  ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(TensorArray) the array will be read from.");
    AddInput("I",
@@ -177,14 +175,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("read_from_array");
    grad_op->SetInput("I", Input("I"));
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };

@@ -193,14 +191,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad_op = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("write_to_array");
    grad_op->SetInput("I", Input("I"));
    grad_op->SetInput("X", OutputGrad("Out"));
    grad_op->SetOutput("Out", InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDescBind>(grad_op);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
 };


--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -46,7 +46,7 @@ class TopkOp : public framework::OperatorWithKernel {

 class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of Topk op");
    AddOutput("Out", "(Tensor) The output tensor of Topk op");

--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -55,8 +55,7 @@ class TransposeOp : public framework::OperatorWithKernel {

 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  TransposeOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -66,15 +66,14 @@ class UniformRandomOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
-        static_cast<framework::DataType>(ctx.Attr<int>("dtype")),
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.GetPlace());
  }
 };

 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  UniformRandomOpMaker(framework::OpProto* proto,
-                       framework::OpAttrChecker* op_checker)
+  UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
    AddComment(R"DOC(
@@ -100,7 +99,7 @@ uniform distribution.
                 "0 means use a seed generated by the system.")
        .SetDefault(0);
    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
-        .SetDefault(framework::DataType::FP32);
+        .SetDefault(framework::proto::DataType::FP32);
  }
 };
 }  // namespace operators

--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
@@ -18,8 +18,7 @@ namespace operators {

 class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  Unpool2dOpMaker(framework::OpProto* proto,
-                  framework::OpAttrChecker* op_checker)
+  Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",

--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -46,7 +46,7 @@ class WhileOp : public framework::OperatorBase {
    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));

    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();

    auto step_scopes =
@@ -64,7 +64,7 @@ class WhileOp : public framework::OperatorBase {

 class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(kParameters,
             "A set of variables, which are required by operators inside the "
@@ -82,7 +82,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
              "(StepScopeVar) A vector of local scope, which size equals the "
              "step number of While Op. The i'th scope storages temporary "
              "variables generated in the i'th step.");
-    AddAttr<framework::BlockDescBind *>(kStepBlock,
+    AddAttr<framework::BlockDesc *>(kStepBlock,
                                    "The step block inside WhileOp");
    AddComment(R"DOC(
 )DOC");
@@ -99,7 +99,7 @@ class WhileGradOp : public framework::OperatorBase {
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
    framework::Executor executor(dev_ctx);
-    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();

    auto *step_scopes =
@@ -209,8 +209,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
-  std::unique_ptr<framework::OpDescBind> Apply() const override {
-    auto *grad = new framework::OpDescBind();
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad = new framework::OpDesc();
    grad->SetType("while_grad");
    grad->SetInput(kParameters, Input(kParameters));

@@ -279,14 +279,14 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
    // while operator could be renamed.
    grad->SetAttr("original_output_grad", extra_inputs_list);

-    return std::unique_ptr<framework::OpDescBind>(grad);
+    return std::unique_ptr<framework::OpDesc>(grad);
  }
 };

 class WhileGradOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDescBind &op_desc,
-                  framework::BlockDescBind *block) const override {
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
    auto p_names = op_desc.Input(kParameters);
    auto pg_names = op_desc.Output(framework::GradVarName(kParameters));

@@ -321,10 +321,10 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
        continue;
      }
      auto dims = ctx->GetInputsElementDim(kParameters, i);
-      if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
+      if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
        names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);
-      } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
+      } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
        // not sure how to set the dim of LOD_TENSOR_ARRAY
        names_to_set.push_back(pg_names[i]);
        dims_to_set.push_back(dims);

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -19,7 +19,7 @@ CPUDeviceContext::CPUDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }

-CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
 }

@@ -27,7 +27,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
  return eigen_device_.get();
 }

-Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+Place CPUDeviceContext::GetPlace() const { return place_; }

 #ifdef PADDLE_WITH_CUDA


--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -45,6 +45,7 @@ class CPUDeviceContext : public DeviceContext {
  Place GetPlace() const override;

 private:
+  CPUPlace place_;
  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };


--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
 }

-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind) {
-  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
-                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
-  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
-  //       block on stream 0 to make sure the copy has completed
-  PADDLE_ENFORCE(
-      cudaStreamSynchronize(0),
-      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
-}
-
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                   size_t count, cudaStream_t stream) {
  PADDLE_ENFORCE(

--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream);

-//! Copy memory from address src to dst synchronously.
-void GpuMemcpySync(void *dst, const void *src, size_t count,
-                   enum cudaMemcpyKind kind);
-
 //! Copy memory from one device to another device.
 void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
                   size_t count, cudaStream_t stream);

--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) {
  CUDADeviceContext ctx(gpu0);
  float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
-  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
  Transform<paddle::platform::CUDADeviceContext> trans;
  trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
  ctx.Wait();
-  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
  Free(gpu0, gpu_buf);
  for (int i = 0; i < 4; ++i) {
    ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
@@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) {
  GPUPlace gpu0(0);
  CUDADeviceContext ctx(gpu0);
  int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
-  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
  Transform<paddle::platform::CUDADeviceContext> trans;
  trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
  ctx.Wait();
-  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
  Free(gpu0, gpu_buf);
  for (int i = 0; i < 4; ++i) {
    ASSERT_EQ((i + 1) * (i + 1), buf[i]);

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 if(WITH_PYTHON)
  cc_library(paddle_pybind SHARED
-    SRCS pybind.cc exception.cc protobuf.cc
+    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
    DEPS pybind python backward proto_desc paddle_memory executor prune init
    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)

--- a/paddle/pybind/const_value.cc
+++ b/paddle/pybind/const_value.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "const_value.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindConstValue(pybind11::module& m) {
+  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m.def("kTempVarName", [] { return framework::kTempVarName; });
+  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+}
+
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/pybind/const_value.h
+++ b/paddle/pybind/const_value.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <Python.h>
+#include "paddle/platform/enforce.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+extern void BindConstValue(pybind11::module& m);
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/pybind/print_operators_doc.cc
+++ b/paddle/pybind/print_operators_doc.cc
@@ -31,31 +31,32 @@ std::string Escape(const std::string& s) {
  return r;
 }

-std::string AttrType(paddle::framework::AttrType at) {
+std::string AttrType(paddle::framework::proto::AttrType at) {
  switch (at) {
-    case paddle::framework::INT:
+    case paddle::framework::proto::INT:
      return "int";
-    case paddle::framework::FLOAT:
+    case paddle::framework::proto::FLOAT:
      return "float";
-    case paddle::framework::STRING:
+    case paddle::framework::proto::STRING:
      return "string";
-    case paddle::framework::BOOLEAN:
+    case paddle::framework::proto::BOOLEAN:
      return "bool";
-    case paddle::framework::INTS:
+    case paddle::framework::proto::INTS:
      return "int array";
-    case paddle::framework::FLOATS:
+    case paddle::framework::proto::FLOATS:
      return "float array";
-    case paddle::framework::STRINGS:
+    case paddle::framework::proto::STRINGS:
      return "string array";
-    case paddle::framework::BOOLEANS:
+    case paddle::framework::proto::BOOLEANS:
      return "bool array";
-    case paddle::framework::BLOCK:
+    case paddle::framework::proto::BLOCK:
      return "block id";
  }
  return "UNKNOWN";  // not possible
 }

-void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
+void PrintVar(const paddle::framework::proto::OpProto::Var& v,
+              std::stringstream& ss) {
  ss << " { "
     << "\n"
     << "   \"name\" : \"" << Escape(v.name()) << "\",\n"
@@ -65,7 +66,7 @@ void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
     << " },";
 }

-void PrintAttr(const paddle::framework::OpProto::Attr& a,
+void PrintAttr(const paddle::framework::proto::OpProto::Attr& a,
               std::stringstream& ss) {
  ss << " { "
     << "\n"
@@ -81,7 +82,7 @@ void PrintOpProto(const std::string& type,
                  std::stringstream& ss) {
  std::cerr << "Processing " << type << "\n";

-  const paddle::framework::OpProto* p = opinfo.proto_;
+  const paddle::framework::proto::OpProto* p = opinfo.proto_;
  if (p == nullptr) {
    return;  // It is possible that an operator doesn't have OpProto.
  }

--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) {

 // Bind Methods
 void BindProgramDesc(py::module &m) {
-  py::class_<ProgramDescBind>(m, "ProgramDesc", "")
+  py::class_<ProgramDesc>(m, "ProgramDesc", "")
      .def(py::init<>())
      .def("__init__",
-           [](ProgramDescBind &self, const ProgramDescBind &other) {
-             new (&self) ProgramDescBind(other);
+           [](ProgramDesc &self, const ProgramDesc &other) {
+             new (&self) ProgramDesc(other);
           })
      .def("__init__",
-           [](ProgramDescBind &self, const py::bytes &binary_str) {
+           [](ProgramDesc &self, const py::bytes &binary_str) {
             std::string str(binary_str);
-             new (&self) ProgramDescBind(str);
+             new (&self) ProgramDesc(str);
           })
-      .def("append_block", &ProgramDescBind::AppendBlock,
+      .def("append_block", &ProgramDesc::AppendBlock,
           py::return_value_policy::reference)
      .def("append_backward",
-           [](ProgramDescBind &program_desc, const VarDescBind &target,
+           [](ProgramDesc &program_desc, const VarDesc &target,
              const std::unordered_set<std::string> &no_grad_vars) {
             ParamGradInfoMap param_grad_map =
                 AppendBackward(program_desc, target, no_grad_vars);
@@ -138,13 +138,13 @@ void BindProgramDesc(py::module &m) {
             }
             return retv;
           })
-      .def("block", &ProgramDescBind::MutableBlock,
+      .def("block", &ProgramDesc::MutableBlock,
           py::return_value_policy::reference)
-      .def("num_blocks", &ProgramDescBind::Size)
-      .def("serialize_to_string", SerializeMessage<ProgramDescBind>)
+      .def("num_blocks", &ProgramDesc::Size)
+      .def("serialize_to_string", SerializeMessage<ProgramDesc>)
      .def("parse_from_string",
-           [](ProgramDescBind &program_desc, const std::string &data) {
-             ProgramDesc *desc = program_desc.Proto();
+           [](ProgramDesc &program_desc, const std::string &data) {
+             proto::ProgramDesc *desc = program_desc.Proto();
             PADDLE_ENFORCE(desc->ParseFromString(data),
                            "Fail to parse ProgramDesc from string. This could "
                            "be a bug of Paddle.");
@@ -152,109 +152,115 @@ void BindProgramDesc(py::module &m) {
 }

 void BindBlockDesc(py::module &m) {
-  py::class_<BlockDescBind>(m, "BlockDesc", "")
-      .def_property_readonly("id", &BlockDescBind::ID)
-      .def_property_readonly("parent", &BlockDescBind::Parent)
-      .def("append_op", &BlockDescBind::AppendOp,
+  py::class_<BlockDesc>(m, "BlockDesc", "")
+      .def_property_readonly("id", &BlockDesc::ID)
+      .def_property_readonly("parent", &BlockDesc::Parent)
+      .def("append_op", &BlockDesc::AppendOp,
           py::return_value_policy::reference)
-      .def("prepend_op", &BlockDescBind::PrependOp,
+      .def("prepend_op", &BlockDesc::PrependOp,
           py::return_value_policy::reference)
+      .def("remove_op", &BlockDesc::RemoveOp)
      .def("var",
-           [](BlockDescBind &self, py::bytes byte_name) {
+           [](BlockDesc &self, py::bytes byte_name) {
             std::string name = byte_name;
             return self.Var(name);
           },
           py::return_value_policy::reference)
      .def("has_var",
-           [](BlockDescBind &self, py::bytes byte_name) {
+           [](BlockDesc &self, py::bytes byte_name) {
             std::string name = byte_name;
             return self.HasVar(name);
           })
      .def("find_var",
-           [](BlockDescBind &self, py::bytes byte_name) {
+           [](BlockDesc &self, py::bytes byte_name) {
             std::string name = byte_name;
             return self.FindVar(name);
           },
           py::return_value_policy::reference)
-      .def("all_vars", &BlockDescBind::AllVars,
-           py::return_value_policy::reference)
-      .def("op_size", &BlockDescBind::OpSize)
-      .def("op", &BlockDescBind::Op, py::return_value_policy::reference)
-      .def("serialize_to_string", SerializeMessage<BlockDescBind>);
+      .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
+      .def("op_size", &BlockDesc::OpSize)
+      .def("op", &BlockDesc::Op, py::return_value_policy::reference)
+      .def("serialize_to_string", SerializeMessage<BlockDesc>);
 }

 void BindVarDsec(py::module &m) {
-  py::enum_<DataType>(m, "DataType", "")
-      .value("BOOL", DataType::BOOL)
-      .value("INT16", DataType::INT16)
-      .value("INT32", DataType::INT32)
-      .value("INT64", DataType::INT64)
-      .value("FP16", DataType::FP16)
-      .value("FP32", DataType::FP32)
-      .value("FP64", DataType::FP64);
+  py::enum_<proto::DataType>(m, "DataType", "")
+      .value("BOOL", proto::DataType::BOOL)
+      .value("INT16", proto::DataType::INT16)
+      .value("INT32", proto::DataType::INT32)
+      .value("INT64", proto::DataType::INT64)
+      .value("FP16", proto::DataType::FP16)
+      .value("FP32", proto::DataType::FP32)
+      .value("FP64", proto::DataType::FP64);

-  py::class_<VarDescBind> var_desc(m, "VarDesc", "");
+  py::class_<VarDesc> var_desc(m, "VarDesc", "");
  var_desc
      .def("name",
-           [](const VarDescBind &self) {
+           [](const VarDesc &self) {
             py::bytes name = self.Name();
             return name;
           },
           py::return_value_policy::reference)
-      .def("set_shape", &VarDescBind::SetShape)
-      .def("set_dtype", &VarDescBind::SetDataType)
-      .def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
-      .def("dtype", &VarDescBind::GetDataType)
-      .def("lod_level", &VarDescBind::GetLodLevel)
-      .def("set_lod_level", &VarDescBind::SetLoDLevel)
-      .def("type", &VarDescBind::GetType)
-      .def("set_type", &VarDescBind::SetType)
-      .def("serialize_to_string", SerializeMessage<VarDescBind>)
-      .def("persistable", &VarDescBind::Persistable)
-      .def("set_persistable", &VarDescBind::SetPersistable);
+      .def("set_shape", &VarDesc::SetShape)
+      .def("set_dtype", &VarDesc::SetDataType)
+      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("dtype", &VarDesc::GetDataType)
+      .def("lod_level", &VarDesc::GetLodLevel)
+      .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("type", &VarDesc::GetType)
+      .def("set_type", &VarDesc::SetType)
+      .def("serialize_to_string", SerializeMessage<VarDesc>)
+      .def("persistable", &VarDesc::Persistable)
+      .def("set_persistable", &VarDesc::SetPersistable);

-  py::enum_<VarDesc::VarType>(var_desc, "VarType", "")
-      .value("LOD_TENSOR", VarDesc::LOD_TENSOR)
-      .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS)
-      .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
-      .value("FETCH_LIST", VarDesc::FETCH_LIST)
-      .value("STEP_SCOPES", VarDesc::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE)
-      .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY);
+  py::enum_<proto::VarDesc::VarType>(var_desc, "VarType", "")
+      .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR)
+      .value("SELECTED_ROWS", proto::VarDesc::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", proto::VarDesc::FEED_MINIBATCH)
+      .value("FETCH_LIST", proto::VarDesc::FETCH_LIST)
+      .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
+      .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY);
 }

 void BindOpDesc(py::module &m) {
-  py::enum_<AttrType>(m, "AttrType", "")
-      .value("INT", AttrType::INT)
-      .value("INTS", AttrType::INTS)
-      .value("FLOAT", AttrType::FLOAT)
-      .value("FLOATS", AttrType::FLOATS)
-      .value("STRING", AttrType::STRING)
-      .value("STRINGS", AttrType::STRINGS)
-      .value("BOOL", AttrType::BOOLEAN)
-      .value("BOOLS", AttrType::BOOLEANS)
-      .value("BLOCK", AttrType::BLOCK);
+  py::enum_<proto::AttrType>(m, "AttrType", "")
+      .value("INT", proto::AttrType::INT)
+      .value("INTS", proto::AttrType::INTS)
+      .value("FLOAT", proto::AttrType::FLOAT)
+      .value("FLOATS", proto::AttrType::FLOATS)
+      .value("STRING", proto::AttrType::STRING)
+      .value("STRINGS", proto::AttrType::STRINGS)
+      .value("BOOL", proto::AttrType::BOOLEAN)
+      .value("BOOLS", proto::AttrType::BOOLEANS)
+      .value("BLOCK", proto::AttrType::BLOCK);

-  py::class_<OpDescBind> op_desc(m, "OpDesc", "");
-  op_desc.def("type", &OpDescBind::Type)
-      .def("set_type", &OpDescBind::SetType)
-      .def("input", &OpDescBind::Input)
-      .def("input_names", &OpDescBind::InputNames)
-      .def("set_input", &OpDescBind::SetInput)
-      .def("output", &OpDescBind::Output)
-      .def("output_names", &OpDescBind::OutputNames)
-      .def("set_output", &OpDescBind::SetOutput)
-      .def("has_attr", &OpDescBind::HasAttr)
-      .def("attr_type", &OpDescBind::GetAttrType)
-      .def("attr_names", &OpDescBind::AttrNames)
-      .def("set_attr", &OpDescBind::SetAttr)
-      .def("attr", &OpDescBind::GetAttr)
-      .def("set_block_attr", &OpDescBind::SetBlockAttr)
-      .def("block_attr", &OpDescBind::GetBlockAttr)
-      .def("check_attrs", &OpDescBind::CheckAttrs)
-      .def("infer_shape", &OpDescBind::InferShape)
-      .def("infer_var_type", &OpDescBind::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDescBind>);
+  py::class_<OpDesc> op_desc(m, "OpDesc", "");
+  op_desc.def("type", &OpDesc::Type)
+      .def("set_type", &OpDesc::SetType)
+      .def("input", &OpDesc::Input)
+      .def("input_names", &OpDesc::InputNames)
+      .def("set_input", &OpDesc::SetInput)
+      .def("output", &OpDesc::Output)
+      .def("output_names", &OpDesc::OutputNames)
+      .def("set_output", &OpDesc::SetOutput)
+      .def("has_attr", &OpDesc::HasAttr)
+      .def("attr_type", &OpDesc::GetAttrType)
+      .def("attr_names", &OpDesc::AttrNames)
+      .def("set_attr", &OpDesc::SetAttr)
+      .def("attr", &OpDesc::GetAttr)
+      .def("set_block_attr", &OpDesc::SetBlockAttr)
+      .def("set_serialized_attr",
+           [](OpDesc &self, const std::string &name,
+              const py::bytes &seriralized) {
+             std::string ser(seriralized);
+             self.SetAttr(name, ser);
+           })
+      .def("block_attr", &OpDesc::GetBlockAttr)
+      .def("check_attrs", &OpDesc::CheckAttrs)
+      .def("infer_shape", &OpDesc::InferShape)
+      .def("infer_var_type", &OpDesc::InferVarType)
+      .def("serialize_to_string", SerializeMessage<OpDesc>);
 }

 }  // namespace pybind

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/operators/net_op.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/pybind/const_value.h"
 #include "paddle/pybind/exception.h"
 #include "paddle/pybind/pybind.h"
 #include "paddle/pybind/tensor_py.h"
@@ -265,36 +266,36 @@ All parameter, weight, gradient are variables in Paddle.
    return ret_values;
  });
  m.def("get_grad_op_descs",
-        [](const OpDescBind &op_desc,
+        [](const OpDesc &op_desc,
           const std::unordered_set<std::string> &no_grad_set,
           std::unordered_map<std::string, std::string> &grad_to_var,
-           const std::vector<BlockDescBind *> &grad_sub_block) {
-          std::vector<std::unique_ptr<OpDescBind>> grad_op_descs =
+           const std::vector<BlockDesc *> &grad_sub_block) {
+          std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
              framework::OpInfoMap::Instance()
                  .Get(op_desc.Type())
                  .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
                                 grad_sub_block);
-          std::vector<OpDescBind *> grad_op_desc_ptrs(grad_op_descs.size());
+          std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
          std::transform(
              grad_op_descs.begin(), grad_op_descs.end(),
              grad_op_desc_ptrs.begin(),
-              [](std::unique_ptr<OpDescBind> &p) { return p.release(); });
+              [](std::unique_ptr<OpDesc> &p) { return p.release(); });
          return grad_op_desc_ptrs;
        });
-  m.def("prune", [](const ProgramDescBind &origin,
+  m.def("prune", [](const ProgramDesc &origin,
                    const std::vector<std::array<size_t, 2>> &targets) {
-    ProgramDescBind prog_with_targets(origin);
+    ProgramDesc prog_with_targets(origin);
    for (const auto &t : targets) {
      prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
    }
-    ProgramDesc pruned_desc;
+    proto::ProgramDesc pruned_desc;
    Prune(*prog_with_targets.Proto(), &pruned_desc);
-    return new ProgramDescBind(pruned_desc);
+    return new ProgramDesc(pruned_desc);
  });
-  m.def("inference_optimize", [](ProgramDescBind &origin) {
-    ProgramDesc pruned_desc;
+  m.def("inference_optimize", [](ProgramDesc &origin) {
+    proto::ProgramDesc pruned_desc;
    InferenceOptimize(*(origin.Proto()), &pruned_desc);
-    return new ProgramDescBind(pruned_desc);
+    return new ProgramDesc(pruned_desc);
  });
  m.def_submodule(
       "var_names",
@@ -344,7 +345,7 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<OperatorBase>(m, "Operator")
      .def_static("create",
                  [](py::bytes protobin) {
-                    OpDesc desc;
+                    proto::OpDesc desc;
                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                                   "Cannot parse user input to OpDesc");
                    PADDLE_ENFORCE(desc.IsInitialized(),
@@ -397,7 +398,7 @@ All parameter, weight, gradient are variables in Paddle.
  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
      .def_static("create",
                  [](py::bytes protobin) -> operators::CondOp * {
-                    OpDesc desc;
+                    proto::OpDesc desc;
                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                                   "Cannot parse user input to OpDesc");
                    PADDLE_ENFORCE(desc.IsInitialized(),
@@ -431,6 +432,7 @@ All parameter, weight, gradient are variables in Paddle.
  BindBlockDesc(m);
  BindVarDsec(m);
  BindOpDesc(m);
+  BindConstValue(m);

  py::class_<framework::LoDRankTable>(m, "LodRankTable")
      .def("items", [](framework::LoDRankTable &table) {

--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -14,6 +14,7 @@

 #pragma once
 #include <string>
+#include "paddle/framework/executor.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/memory/memcpy.h"
 #include "pybind11/numpy.h"
@@ -61,11 +62,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
            tensor.dims(), platform::CPUPlace()));
-        // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
-        // a Python numpy array. It's better to manage CDUA stream unifiedly.
-        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
-                                        sizeof(CUR_TYPE) * tensor.numel(),
-                                        cudaMemcpyDeviceToHost);
+
+        framework::DeviceContextPool &pool =
+            framework::DeviceContextPool::Get();
+        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
+            pool.Borrow(tensor.place()));
+
+        paddle::platform::GpuMemcpyAsync(
+            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
+            cudaMemcpyDeviceToHost, dev_ctx->stream());
 #else
        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
 #endif
@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray(

  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(place);
-  // TODO(qijun): Here we use default CUDA stream to set a Python numpy
-  // array to a GPU Tensor. It's better to manage CDUA stream unifiedly.
-  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
-                                  cudaMemcpyHostToDevice);
+
+  framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
 }
 #endif


--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -14,9 +14,8 @@ make -j `nproc` print_operators_doc
 paddle/pybind/print_operators_doc > doc/en/html/operators.json

 # check websites for broken links
-# It will be failed now!
-#linkchecker doc/en/html/index.html
-#linkchecker doc/cn/html/index.html
+linkchecker doc/en/html/index.html
+linkchecker doc/cn/html/index.html

 # Parse Github URL
 REPO=`git config remote.origin.url`

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -25,10 +25,10 @@ from paddle.trainer.config_parser import *
 __all__ = [
    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
-    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'dot_product_attention', 'multi_head_attention',
-    'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm',
-    'inputs', 'outputs'
+    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
+    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
+    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
+    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
 ]

 ######################################################
@@ -435,6 +435,85 @@ def img_conv_group(input,
        input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)


+@wrap_name_default("separable_conv")
+def img_separable_conv(input,
+                       num_channels,
+                       num_out_channels,
+                       filter_size,
+                       stride=1,
+                       padding=0,
+                       depth_multiplier=1,
+                       act=None,
+                       bias_attr=None,
+                       param_attr=None,
+                       shared_bias=True,
+                       layer_type='exconv',
+                       name=None):
+    """
+    Separable Convolution.
+
+    The separable convolution module consists of a depthwise convolution
+    that acts separately on input channels, followed by a pointwise convolution
+    with 1*1 kernels that mixes channels. It is used for Xception:
+    https://arxiv.org/pdf/1610.02357.pdf
+
+    :param input: input layer.
+    :type input: LayerOutput
+    :param num_channels: the number of input channels.
+    :type num_channels: int
+    :param num_out_channels: the number of output channels.
+    :type num_out_channels: int
+    :param filter_size: the filter size for the depthwise convolution.
+    :type filter_size: int|tuple
+    :param stride: the stride size for the depthwise convolution.
+    :type stride: int|tuple
+    :param padding: the padding size for the depthwise convolution.
+    :type padding: int|tuple
+    :param depth_multiplier: the number of filters for one channel in the
+                             depthwise convolution.
+    :type depth_multiplier: int
+    :param act: the activation function for the output.
+    :type act: BaseActivation
+    :param bias_attr: see img_conv_layer for details.
+    :type bias_attr: ParameterAttribute
+    :param param_attr: see img_conv_layer for details.
+    :type param_attr: ParameterAttribute
+    :param shared_bias: see img_conv_layer for details.
+    :type shared_bias: bool
+    :param layer_type: see img_conv_layer for details.
+    :type layer_type: basestring
+    :return: layer's output
+    :rtype: LayerOutput
+    """
+    __depthwise_conv__ = img_conv_layer(
+        name="%s_depthwise_conv" % name,
+        input=input,
+        num_channels=num_channels,
+        num_filters=num_channels * depth_multiplier,
+        groups=num_channels,
+        filter_size=filter_size,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias,
+        layer_type=layer_type)
+    __pointwise_conv__ = img_conv_layer(
+        name="%s_pointwise_conv" % name,
+        input=__depthwise_conv__,
+        num_channels=num_channels * depth_multiplier,
+        num_filters=num_out_channels,
+        filter_size=1,
+        stride=1,
+        padding=0,
+        act=act,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias)
+    return __pointwise_conv__
+
+
 def small_vgg(input_image, num_channels, num_classes):
    def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
        return img_conv_group(

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -16,12 +16,14 @@ import regularizer
 from param_attr import ParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, GPUPlace
+from distribute_transpiler import DistributeTranspiler
+import clip

 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
     'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
-    'DataFeeder'
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr',
+    'DataFeeder', 'clip', 'DistributeTranspiler'
 ]



--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
+import functools
+import layers
+
+__all__ = ['GradientClipByValue', 'append_gradient_clip_ops']
+
+
+class BaseGradientClipAttr(object):
+    """Interface for per-parameter gradient clipping strategies.
+
+    append_gradient_clip_ops calls process_context for every parameter and
+    afterwards create_operators; subclasses must override both methods.
+    """
+
+    def process_context(self, context, p_g):
+        """Inspect the (param, grad) list before any clip op is created."""
+        raise NotImplementedError()
+
+    def create_operators(self, param, grad):
+        """Return the (param, grad) pair to use after clipping."""
+        raise NotImplementedError()
+        raise NotImplementedError()
+
+
+class NullGradientClipAttr(BaseGradientClipAttr):
+    """Clipping strategy that leaves every gradient untouched."""
+
+    def process_context(self, context, p_g):
+        """Nothing to collect: this strategy keeps no shared state."""
+        return None
+
+    def create_operators(self, param, grad):
+        """Hand the pair back unchanged; no operator is appended."""
+        unchanged = (param, grad)
+        return unchanged
+
+
+class GradientClipByValue(BaseGradientClipAttr):
+    """Clip gradients through layers.clip with fixed min/max bounds.
+
+    :param max: upper bound, converted to float.
+    :param min: lower bound, converted to float; defaults to -max when None.
+    """
+
+    def __init__(self, max, min=None):
+        upper = float(max)
+        lower = -upper if min is None else float(min)
+        self.max = upper
+        self.min = lower
+
+    def process_context(self, context, p_g):
+        """No cross-parameter state is needed for value clipping."""
+        return None
+
+    def create_operators(self, param, grad):
+        """Append a clip layer on grad and return (param, clipped grad)."""
+        clipped = layers.clip(x=grad, min=self.min, max=self.max)
+        return param, clipped
+
+
+def append_gradient_clip_ops(param_grad):
+    """Apply each parameter's gradient clipping strategy.
+
+    The strategy is read from the parameter's ``clip_attr`` attribute; a
+    missing or None attribute means "no clipping" (NullGradientClipAttr).
+
+    Work is done in two passes: first every strategy sees the full
+    (param, grad) list through process_context, and only then are the
+    create_operators callbacks run, in the original order.
+
+    :param param_grad: list of (param, grad) pairs.
+    :return: list with the result of create_operators for every pair.
+    :raises TypeError: if clip_attr is not a BaseGradientClipAttr instance.
+    """
+    context = dict()
+    create_op_callbacks = []
+    for p, g in param_grad:
+        clip_attr = getattr(p, 'clip_attr', None)
+        if clip_attr is None:
+            clip_attr = NullGradientClipAttr()
+        if not isinstance(clip_attr, BaseGradientClipAttr):
+            # The message names the real base class so users can find it.
+            raise TypeError(
+                "clip attribute should be an instance of BaseGradientClipAttr")
+
+        clip_attr.process_context(context=context, p_g=param_grad)
+        # Defer operator creation until every strategy has seen the context.
+        create_op_callbacks.append(
+            functools.partial(
+                clip_attr.create_operators, param=p, grad=g))
+
+    return [each_callback() for each_callback in create_op_callbacks]
+
+
+ClipByValue = GradientClipByValue
--- a/python/paddle/v2/fluid/distribute_transpiler.py
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
+import framework
+from framework import Program, default_main_program, Parameter, Variable
+import optimizer
+from layer_helper import LayerHelper
+
+
+def hash_name_to_server(params_grads, pserver_endpoints):
+    """
+    Assign parameters to parameter servers by hashing the parameter name.
+
+    Pairs whose parameter is not trainable, or whose gradient is None, are
+    skipped.
+
+    :param params_grads: iterable of (param, grad) pairs.
+    :param pserver_endpoints: list of parameter server endpoints.
+    :return: a map of pserver endpoint ->
+                    params -> [param list]
+                    grads  -> [grad list]
+    """
+
+    def _hash_param(param_name, total):
+        # NOTE(review): relies on the builtin hash(); presumably every
+        # process that splits the parameters must get the same value for a
+        # given name -- confirm str hash randomization is off where this runs.
+        return hash(param_name) % total
+
+    param_grad_map = dict()
+    for param, grad in params_grads:
+        if param.trainable is True and grad is not None:
+            server_id = _hash_param(param.name, len(pserver_endpoints))
+            server_for_param = pserver_endpoints[server_id]
+            # setdefault replaces the Python-2-only dict.has_key() check.
+            entry = param_grad_map.setdefault(server_for_param,
+                                              {"params": [], "grads": []})
+            entry["params"].append(param)
+            entry["grads"].append(grad)
+
+    return param_grad_map
+
+
+def round_robin(params_grads, pserver_endpoints):
+    """
+    Assign parameters to parameter servers in turn, wrapping around.
+
+    Pairs whose parameter is not trainable, or whose gradient is None, are
+    skipped and do not advance the server index. Skipping None gradients
+    matches hash_name_to_server and keeps None out of the "grads" lists,
+    whose entries are later dereferenced (``var.name``).
+
+    :param params_grads: list of (param, grad) pairs; must be longer than
+                         pserver_endpoints.
+    :param pserver_endpoints: list of parameter server endpoints.
+    :return: a map of pserver endpoint ->
+                    params -> [param list]
+                    grads  -> [grad list]
+    """
+    assert (len(params_grads) > len(pserver_endpoints))
+
+    param_grad_map = dict()
+    pserver_idx = 0
+    for param, grad in params_grads:
+        if param.trainable is True and grad is not None:
+            server_for_param = pserver_endpoints[pserver_idx]
+            # setdefault replaces the Python-2-only dict.has_key() check.
+            entry = param_grad_map.setdefault(server_for_param,
+                                              {"params": [], "grads": []})
+            entry["params"].append(param)
+            entry["grads"].append(grad)
+
+            pserver_idx = (pserver_idx + 1) % len(pserver_endpoints)
+    return param_grad_map
+
+
+class DistributeTranspiler:
+    """
+    Split a program into a trainer side and a parameter-server side.
+
+    transpile() appends a "send" op to the trainer program and records which
+    parameters/gradients belong to which endpoint in self.param_grad_map;
+    get_pserver_program() then builds the program for one endpoint.
+    """
+
+    def transpile(self,
+                  optimize_ops,
+                  params_grads,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  split_method=round_robin):
+        """
+            Transpile the program to a distributed data-parallelism program.
+
+            The main_program will be transformed to use a remote parameter
+            server to do parameter optimization, and the optimization graph
+            will be put into a parameter server program.
+
+            Use different methods to split trainable variables to different
+            parameter servers.
+
+            Example to run:
+
+            exe = fluid.Executor(place)
+            t = fluid.DistributeTranspiler()
+            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
+
+            pserver_endpoint = os.getenv("PSERVER")
+            if pserver_endpoint:
+                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
+                exe.run(fluid.default_startup_program())
+                exe.run(pserver_prog)
+            else:
+                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+                exe.run(fluid.default_startup_program())
+
+                for pass_id in range(PASS_NUM):
+                    ...
+
+            :param optimize_ops: op list of optimization, should be the
+                                 return value of Optimizer.minimize
+            :type optimize_ops: list
+            :param params_grads: list of (param, grad) pairs to distribute
+            :type params_grads: list
+            :param program: program to optimize, default default_main_program
+            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
+            :type pservers: string
+            :param trainers: number of trainers; stored in self.trainers
+            :type trainers: int
+            :param split_method: callable(params_grads, endpoints) returning
+                                 the endpoint -> {"params", "grads"} map
+
+            :return: None. The program is modified in place and the split is
+                     stored in self.param_grad_map.
+        """
+        if program is None:
+            program = default_main_program()
+        self.trainers = trainers
+        self._optimize_distributed(
+            optimize_ops,
+            program,
+            params_grads,
+            pservers=pservers,
+            trainers=trainers,
+            split_method=split_method)
+
+    def _clone_param(self, block, v):
+        """Create a Parameter in block mirroring v, register and return it."""
+        assert isinstance(v, Parameter)
+        new_p = Parameter(
+            block=block,
+            shape=v.shape,
+            dtype=v.dtype,
+            type=v.type,
+            lod_level=v.lod_level,
+            stop_gradient=v.stop_gradient,
+            trainable=v.trainable,
+            optimize_attr=v.optimize_attr,
+            regularizer=v.regularizer,
+            name=v.name)
+        block.vars[new_p.name] = new_p
+        # Returned for parity with _clone_var; existing callers ignore it.
+        return new_p
+
+    def _clone_var(self, block, var):
+        """Create in block a variable with the same attributes as var."""
+        assert isinstance(var, Variable)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=var.persistable)
+
+    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
+                              **kwargs):
+        """
+        Split params/grads over the pservers and append the "send" op.
+
+        Reads kwargs["pservers"] (comma separated endpoints, required) and
+        kwargs["split_method"] (optional, defaults to round_robin).
+        """
+        split_method = kwargs.get("split_method", round_robin)
+
+        assert (callable(split_method))
+        pserver_endpoints = kwargs["pservers"].split(",")
+        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
+
+        send_op_ordered_inputs = []
+        epmap = []
+        for ep, v in self.param_grad_map.items():
+            send_op_ordered_inputs.extend(v["grads"])
+            # epmap[i] is the endpoint that receives send_op_ordered_inputs[i]
+            epmap.extend([ep] * len(v["grads"]))
+        program.global_block().append_op(
+            type="send",
+            inputs={"X": send_op_ordered_inputs
+                    },  # inputs is the list of gradients to send
+            outputs={},
+            attrs={"endpoints": pserver_endpoints,
+                   "epmap": epmap})
+
+    def get_trainer_program(self, optimize_ops, program):
+        """Remove the optimize ops from the global block of program."""
+        # remove optimize ops; the send op is added by _optimize_distributed
+        program.global_block().delete_ops(optimize_ops)
+
+    def _create_var_for_trainers(self, block, var, trainers):
+        """Create one "<name>.trainer_<i>" copy of var in block per trainer."""
+        var_list = []
+        for i in range(trainers):
+            var_each = block.create_var(
+                name="%s.trainer_%d" % (var.name, i),
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+            var_list.append(var_each)
+        return var_list
+
+    def get_pserver_program(self, endpoint, optimize_ops):
+        """
+        Build the program to run on the parameter server at endpoint.
+
+        The parameters assigned to endpoint are cloned into the returned
+        program, which gets a single "recv" op. The optimize ops are put in a
+        separate sub program that is handed to the recv op through its
+        "OptimizeProgram" attribute.
+
+        transpile() must have been called first: this method reads
+        self.param_grad_map and self.trainers.
+
+        :param endpoint: one of the endpoints given to transpile().
+        :param optimize_ops: op list of optimization.
+        :return: the parameter server Program.
+        """
+        pserver_program = Program()
+        for v in self.param_grad_map[endpoint]["params"]:
+            self._clone_param(pserver_program.global_block(), v)
+
+        optimize_sub_program = Program()
+        grad_var_names = set(
+            var.name for var in self.param_grad_map[endpoint]["grads"])
+        for opt_op in optimize_ops:
+            for _, var in opt_op.inputs.items():
+                # NOTE: append operators to merge gradients from multiple
+                # trainers. If trainers == 1, this is not needed.
+                if self.trainers > 1 and var.name in grad_var_names:
+                    vars2merge = self._create_var_for_trainers(
+                        optimize_sub_program.global_block(), var, self.trainers)
+                    merged_var = optimize_sub_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+                    optimize_sub_program.global_block().append_op(
+                        type="sum",
+                        inputs={"X": vars2merge},
+                        outputs={"Out": merged_var})
+                    # sum followed by scale(1 / trainers) averages the grads
+                    optimize_sub_program.global_block().append_op(
+                        type="scale",
+                        inputs={"X": merged_var},
+                        outputs={"Out": merged_var},
+                        attrs={"scale": 1.0 / float(self.trainers)})
+                else:
+                    optimize_sub_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+            if "Grad" in opt_op.inputs:
+                if opt_op.inputs["Grad"].name not in grad_var_names:
+                    # Grad is not one of this endpoint's gradients: skip op.
+                    continue
+                # Same text as the former Python 2 print statement.
+                print("appending  %s %s" % (opt_op.type, opt_op.inputs))
+            optimize_sub_program.global_block().append_op(
+                type=opt_op.type,
+                inputs=opt_op.inputs,
+                outputs=opt_op.outputs,
+                attrs=opt_op.attrs)
+        pserver_program.global_block().append_op(
+            type="recv",
+            inputs={"RX":
+                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
+            outputs={},
+            attrs={
+                "OptimizeProgram": optimize_sub_program.desc,
+                "endpoint": endpoint,
+                "ParamList":
+                [p.name for p in self.param_grad_map[endpoint]["params"]],
+                "GradList":
+                [p.name for p in self.param_grad_map[endpoint]["grads"]],
+                "Trainers": self.trainers
+            })
+        pserver_program.sync_with_cpp()
+        return pserver_program
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
 import numpy as np

 import layers
-from framework import Program, unique_name, Variable
+from framework import Program, unique_name, Variable, program_guard
 from layer_helper import LayerHelper

 __all__ = ['Accuracy', 'ChunkEvaluator']
@@ -49,15 +49,12 @@ class Evaluator(object):
        if reset_program is None:
            reset_program = Program()

+        with program_guard(main_program=reset_program):
            for var in self.states:
                assert isinstance(var, Variable)
                g_var = _clone_var_(reset_program.current_block(), var)
                layers.fill_constant(
-                shape=g_var.shape,
-                value=0.0,
-                dtype=g_var.dtype,
-                out=g_var,
-                main_program=reset_program)
+                    shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var)

        executor.run(reset_program)

@@ -104,20 +101,14 @@ class Accuracy(Evaluator):
        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
        self.correct = self.create_state(
            dtype='int64', shape=[1], suffix='correct')
-        kwargs = {'main_program': main_program}
        total = self.helper.create_tmp_variable(dtype='int')
        correct = self.helper.create_tmp_variable(dtype='int')
        acc = layers.accuracy(
-            input=input,
-            label=label,
-            k=k,
-            total=total,
-            correct=correct,
-            **kwargs)
-        total = layers.cast(x=total, dtype='int64', **kwargs)
-        correct = layers.cast(x=correct, dtype='int64', **kwargs)
-        layers.sums(input=[self.total, total], out=self.total, **kwargs)
-        layers.sums(input=[self.correct, correct], out=self.correct, **kwargs)
+            input=input, label=label, k=k, total=total, correct=correct)
+        total = layers.cast(x=total, dtype='int64')
+        correct = layers.cast(x=correct, dtype='int64')
+        layers.sums(input=[self.total, total], out=self.total)
+        layers.sums(input=[self.correct, correct], out=self.correct)

        self.metrics.append(acc)

@@ -125,12 +116,12 @@ class Accuracy(Evaluator):
        if eval_program is None:
            eval_program = Program()
        block = eval_program.current_block()
-        kwargs = {'main_program': eval_program}
+        with program_guard(main_program=eval_program):
            total = _clone_var_(block, self.total)
            correct = _clone_var_(block, self.correct)
-        total = layers.cast(total, dtype='float32', **kwargs)
-        correct = layers.cast(correct, dtype='float32', **kwargs)
-        out = layers.elementwise_div(x=correct, y=total, **kwargs)
+            total = layers.cast(total, dtype='float32')
+            correct = layers.cast(correct, dtype='float32')
+            out = layers.elementwise_div(x=correct, y=total)
        return np.array(executor.run(eval_program, fetch_list=[out])[0])


@@ -141,14 +132,14 @@ class ChunkEvaluator(Evaluator):
    numbers.
    """

-    def __init__(self,
+    def __init__(
+            self,
            input,
            label,
            chunk_scheme,
            num_chunk_types,
-                 excluded_chunk_types=None,
-                 **kwargs):
-        super(ChunkEvaluator, self).__init__("chunk_eval", **kwargs)
+            excluded_chunk_types=None, ):
+        super(ChunkEvaluator, self).__init__("chunk_eval")
        main_program = self.helper.main_program
        if main_program.current_block().idx != 0:
            raise ValueError("You can only invoke Evaluator in root block")
@@ -159,26 +150,21 @@ class ChunkEvaluator(Evaluator):
            dtype='int64', shape=[1], suffix='num_label_chunks')
        self.num_correct_chunks = self.create_state(
            dtype='int64', shape=[1], suffix='num_correct_chunks')
-        kwargs = {'main_program': main_program}
        precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
            input=input,
            label=label,
            chunk_scheme=chunk_scheme,
            num_chunk_types=num_chunk_types,
-            excluded_chunk_types=excluded_chunk_types,
-            **kwargs)
+            excluded_chunk_types=excluded_chunk_types, )
        layers.sums(
            input=[self.num_infer_chunks, num_infer_chunks],
-            out=self.num_infer_chunks,
-            **kwargs)
+            out=self.num_infer_chunks)
        layers.sums(
            input=[self.num_label_chunks, num_label_chunks],
-            out=self.num_label_chunks,
-            **kwargs)
+            out=self.num_label_chunks)
        layers.sums(
            input=[self.num_correct_chunks, num_correct_chunks],
-            out=self.num_correct_chunks,
-            **kwargs)
+            out=self.num_correct_chunks)

        self.metrics.extend([precision, recall, f1_score])

@@ -186,7 +172,6 @@ class ChunkEvaluator(Evaluator):
        if eval_program is None:
            eval_program = Program()
        block = eval_program.current_block()
-        kwargs = {'main_program': eval_program}
        num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run(
            eval_program,
            fetch_list=[_clone_var_(block, state) for state in self.states])

--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
 import numpy as np
 from . import core
-from framework import Program, default_main_program
+from framework import Program, default_main_program, Parameter, Variable

 __all__ = ['Executor', 'g_scope']

@@ -148,7 +148,7 @@ class Executor(object):
                outputs={'Out': [fetch_var]},
                attrs={'col': i})

-        self.executor.run(program.desc, scope, 0, True)
+        self.executor.run(program.desc, scope, 0, True, True)
        outs = [
            core.get_fetch_variable(scope, fetch_var_name, i)
            for i in xrange(len(fetch_list))

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
 import collections
+import contextlib

 import numpy as np
-from . import core
+
 import proto.framework_pb2 as framework_pb2
-import google.protobuf.message
-import contextlib
+from . import core

 __all__ = [
    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
@@ -12,6 +12,18 @@ __all__ = [
    'switch_main_program'
 ]

+EMPTY_VAR_NAME = core.kEmptyVarName()
+TEMP_VAR_NAME = core.kTempVarName()
+GRAD_VAR_SUFFIX = core.kGradVarSuffix()
+ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
+
+
+def grad_var_name(var_name):
+    """
+    Return the gradient variable name for the given variable name.
+    """
+    return var_name + GRAD_VAR_SUFFIX
+

 def unique_name(prefix):
    """
@@ -347,6 +359,10 @@ class Operator(object):
        """
        self.block = block
        self.desc = desc
+        # kept so that a new operator can be cloned from this one
+        self.inputs = inputs
+        self.outputs = outputs
+        self.attrs = attrs
        if len(self.desc.type()) != 0:
            return
        if type is None:
@@ -418,13 +434,18 @@ class Operator(object):
                    continue
                if isinstance(attrs[attr_name], Block):
                    self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
+                elif isinstance(attrs[attr_name], core.BlockDesc) or \
+                   isinstance(attrs[attr_name], core.ProgramDesc):
+                    self.desc.set_serialized_attr(
+                        attr_name, attrs[attr_name].serialize_to_string())
                else:
                    self.desc.set_attr(attr_name, attrs[attr_name])

        self.desc.check_attrs()
        no_kernel_op_set = {
            'feed', 'fetch', 'save', 'load', 'recurrent',
-            'rnn_memory_helper_grad', 'conditional_block', 'while'
+            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
+            'recv'
        }
        if type not in no_kernel_op_set:
            self.desc.infer_var_type(self.block.desc)
@@ -570,6 +591,7 @@ class Block(object):
        self.vars = dict()  # var_name --> var
        self.ops = collections.deque()  # operator list
        self.program = program
+        self.removed_vars = dict()

    def __str__(self):
        return self.to_string(True)
@@ -626,6 +648,16 @@ class Block(object):
        self.ops.append(op)
        return op

+    def delete_ops(self, ops):
+        # remove from cpp
+        # FIXME(typhoonzero): remove only the first occurrence.
+        try:
+            start = list(self.ops).index(ops[0])
+            end = list(self.ops).index(ops[-1])
+        except Exception, e:
+            raise e
+        self.desc.remove_op(start, end)
+
    def prepend_op(self, *args, **kwargs):
        op_desc = self.desc.prepend_op()
        op = Operator(self, op_desc, *args, **kwargs)
@@ -704,6 +736,7 @@ class Block(object):
                trainable=p.trainable,
                optimize_attr=p.optimize_attr,
                regularizer=p.regularizer,
+                clip_attr=p.clip_attr,
                name=v.name)
            self.vars[new_p.name] = new_p

@@ -866,6 +899,8 @@ class Parameter(Variable):

        self.regularizer = kwargs.get('regularizer', None)

+        self.clip_attr = kwargs.get('clip_attr', None)
+

 # program is a global instance.
 _main_program_ = Program()

--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -21,19 +21,11 @@ class LayerHelper(object):

    @property
    def main_program(self):
-        prog = self.kwargs.get('main_program', None)
-        if prog is None:
        return default_main_program()
-        else:
-            return prog

    @property
    def startup_program(self):
-        prog = self.kwargs.get('startup_program', None)
-        if prog is None:
        return default_startup_program()
-        else:
-            return prog

    def append_op(self, *args, **kwargs):
        return self.main_program.current_block().append_op(*args, **kwargs)
@@ -151,13 +143,6 @@ class LayerHelper(object):
            persistable=True,
            initializer=initializer)

-    @property
-    def to_kwargs(self):
-        return {
-            'main_program': self.main_program,
-            'startup_program': self.startup_program
-        }
-
    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
        """
        Append bias operator and return its output. If the user does not set

--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -14,11 +14,7 @@ __all__ = [
 ]


-def split_lod_tensor(input,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
+def split_lod_tensor(input, mask, level=0):
    helper = LayerHelper('split_lod_tensor', **locals())
    out_true = helper.create_tmp_variable(dtype=input.dtype)
    out_false = helper.create_tmp_variable(dtype=input.dtype)
@@ -34,13 +30,7 @@ def split_lod_tensor(input,
    return out_true, out_false


-def merge_lod_tensor(in_true,
-                     in_false,
-                     x,
-                     mask,
-                     level=0,
-                     main_program=None,
-                     startup_program=None):
+def merge_lod_tensor(in_true, in_false, x, mask, level=0):
    helper = LayerHelper('merge_lod_tensor', **locals())
    out = helper.create_tmp_variable(dtype=in_true.dtype)
    helper.append_op(
@@ -135,9 +125,8 @@ class StaticRNN(object):
    IN_RNN_BLOCK = 1
    AFTER_RNN_BLOCK = 2

-    def __init__(self, name=None, main_program=None):
-        self.helper = LayerHelper(
-            "static_rnn", name=name, main_program=main_program)
+    def __init__(self, name=None):
+        self.helper = LayerHelper("static_rnn", name=name)
        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
        self.inputs = []  # input variable list in current block
        self.outputs = []  # output variable list in parent block
@@ -354,8 +343,8 @@ class While(object):
    IN_WHILE_BLOCK = 1
    AFTER_WHILE_BLOCK = 2

-    def __init__(self, cond, name=None, main_program=None):
-        self.helper = LayerHelper("while", name=name, main_program=main_program)
+    def __init__(self, cond, name=None):
+        self.helper = LayerHelper("while", name=name)
        self.status = While.BEFORE_WHILE_BLOCK
        if not isinstance(cond, Variable):
            raise TypeError("condition should be a variable")
@@ -406,7 +395,7 @@ class While(object):
            attrs={'sub_block': while_block})


-def lod_rank_table(x, level=0, main_program=None):
+def lod_rank_table(x, level=0):
    """
    This function creates an operator for creating a LOD_RANK_TABLE
    using the input x.
@@ -423,7 +412,7 @@ def lod_rank_table(x, level=0, main_program=None):
    return table


-def max_sequence_len(rank_table, main_program=None):
+def max_sequence_len(rank_table):
    """
    This function creates an operator to calculate the length of
    max seqence through input rank_table(should be a lod_rank_table)
@@ -437,7 +426,7 @@ def max_sequence_len(rank_table, main_program=None):
    return res


-def topk(input, k, main_program=None, startup_program=None):
+def topk(input, k):
    helper = LayerHelper('topk', **locals())
    topk_out = helper.create_tmp_variable(dtype=input.data_type)
    topk_indices = helper.create_tmp_variable(dtype='int64')
@@ -450,10 +439,26 @@ def topk(input, k, main_program=None, startup_program=None):
    return topk_out, topk_indices


-def lod_tensor_to_array(x, table, main_program=None):
-    """
-    This function creates an operator to convert an LOD_Tensor to
+def lod_tensor_to_array(x, table):
+    """This function performs the operation that converts an LOD_Tensor to
       an array.
+
+    Args:
+        x (Variable|list): The tensor that needs to be converted to an array.
+        table (ParamAttr|list): The variable that stores the level of lod
+                                which is ordered by sequence length in
+                                descending order.
+
+    Returns:
+        Variable: The variable of type array that has been converted from a
+                  tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          table = fluid.layers.lod_rank_table(x, level=0)
+          array = fluid.layers.lod_tensor_to_array(x, table)
    """
    helper = LayerHelper("lod_tensor_to_array", **locals())
    array = helper.create_variable(
@@ -468,10 +473,27 @@ def lod_tensor_to_array(x, table, main_program=None):
    return array


-def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
-    """
-    This function creates an operator to convert an array to a
-    LOD_Tensor.
+def array_to_lod_tensor(x, table):
+    """This function performs the operations that converts an array to
+       an LOD_Tensor.
+
+    Args:
+        x (Variable|list): The array that needs to be converted to a tensor.
+        table (ParamAttr|list): The variable that stores the level of lod
+                                which is ordered by sequence length in
+                                descending order.
+
+    Returns:
+        Variable: The variable of type tensor that has been converted
+                  from an array.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          table = fluid.layers.lod_rank_table(x, level=0)
+          array = fluid.layers.lod_tensor_to_array(x, table)
+          lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
    """
    helper = LayerHelper("array_to_lod_tensor", **locals())
    tmp = helper.create_tmp_variable(dtype=x.dtype)
@@ -483,15 +505,25 @@ def array_to_lod_tensor(x, table, main_program=None, startup_program=None):
    return tmp


-def increment(x,
-              value=1.0,
-              in_place=True,
-              main_program=None,
-              startup_program=None):
-    """
-    This function creates an operator to increment each value in the input
-    `x` by an amount: `value` as mentioned in the input parameter. This
-    operation is performed in-place by default.
+def increment(x, value=1.0, in_place=True):
+    """This function performs an operation that increments each value in the
+    input :math:`x` by an amount: :math:`value` as mentioned in the input
+    parameter. This operation is performed in-place by default.
+
+    Args:
+        x (Variable|list): The tensor that has the input values.
+        value (float): The amount by which the values should be incremented.
+        in_place (bool): If the increment should be performed in-place.
+
+    Returns:
+        Variable: The tensor variable storing the transformation of
+                  element-wise increment of each value in the input.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
+          data = fluid.layers.increment(x=data, value=3.0, in_place=True)
    """
    helper = LayerHelper("increment", **locals())
    if not in_place:
@@ -506,10 +538,25 @@ def increment(x,
    return out


-def array_write(x, i, array=None, main_program=None, startup_program=None):
-    """
-    This function creates an operator to write the data out as a
+def array_write(x, i, array=None):
+    """This function performs the operation to write the data out as an
    LOD_TENSOR_ARRAY.
+
+    Args:
+        x (Variable|list): The input tensor from which the data will be read.
+        i (Variable|list): The subscript index in tensor array, that points the
+                           place from which data will be read.
+        array (Variable|list): The data can be read into this variable if
+                               this is assigned.
+    Returns:
+        Variable: The tensor type variable that has the data written to it.
+
+    Examples:
+        .. code-block:: python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = fluid.layers.array_write(tmp, i=i)
    """
    helper = LayerHelper('array_write', **locals())
    if array is None:
@@ -525,7 +572,22 @@ def array_write(x, i, array=None, main_program=None, startup_program=None):
    return array


-def create_array(dtype, main_program=None):
+def create_array(dtype):
+    """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
+    LayerHelper.
+
+    Args:
+        dtype (int|float): The data type of the elements in the array.
+
+    Returns:
+        Variable: The tensor variable storing the elements of data type.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.create_array(dtype='float32')
+
+    """
    helper = LayerHelper("array", **locals())
    return helper.create_variable(
        name="{0}.out".format(helper.name),
@@ -533,7 +595,25 @@ def create_array(dtype, main_program=None):
        dtype=dtype)


-def less_than(x, y, cond=None, main_program=None, **ignored):
+def less_than(x, y, cond=None, **ignored):
+    """
+    **Less than**
+
+    This layer returns the truth value of :math:`x < y` elementwise.
+
+    Args:
+        x(Variable): First operand of *less_than*
+        y(Variable): Second operand of *less_than*
+        cond(Variable|None): Optional output variable to store the result of *less_than*
+
+    Returns:
+        Variable: The tensor variable storing the output of *less_than*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.less_than(x=label, y=limit)
+    """
    helper = LayerHelper("less_than", **locals())
    if cond is None:
        cond = helper.create_tmp_variable(dtype='bool')
@@ -545,10 +625,20 @@ def less_than(x, y, cond=None, main_program=None, **ignored):
    return cond


-def array_read(array, i, main_program=None, startup_program=None):
-    """
-    This function creates an operator to read the data in as a
+def array_read(array, i):
+    """This function performs the operation to read the data in as an
    LOD_TENSOR_ARRAY.
+    Args:
+        array (Variable|list): The input tensor array that data will be read from.
+        i (Variable|list): The subscript index in tensor array, that points the
+                           place from which data will be read.
+    Returns:
+        Variable: The tensor type variable that holds the data read from the array.
+    Examples:
+        .. code-block:: python
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = fluid.layers.array_read(tmp, i=i)
    """
    helper = LayerHelper('array_read', **locals())
    if not isinstance(
@@ -564,7 +654,7 @@ def array_read(array, i, main_program=None, startup_program=None):
    return out


-def shrink_memory(x, i, table, main_program=None, startup_program=None):
+def shrink_memory(x, i, table):
    """
    This function creates an operator to shrink_rnn_memory using the RankTable
    as mentioned in the input parameter.
@@ -581,10 +671,24 @@ def shrink_memory(x, i, table, main_program=None, startup_program=None):
    return out


-def array_length(array, main_program=None):
-    """
-    This function creates an operator to find the length of the
+def array_length(array):
+    """This function performs the operation to find the length of the input
    LOD_TENSOR_ARRAY.
+
+    Args:
+        array (LOD_TENSOR_ARRAY): The input array that will be used
+                                  to compute the length.
+
+    Returns:
+        Variable: The length of the input LoDTensorArray.
+
+    Examples:
+        .. code-block:: python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = fluid.layers.array_write(tmp, i=i)
+          arr_len = fluid.layers.array_length(arr)
    """
    helper = LayerHelper('array_length', **locals())
    tmp = helper.create_tmp_variable(dtype='int64')
@@ -611,20 +715,12 @@ class ConditionalBlockGuard(BlockGuard):


 class ConditionalBlock(object):
-    def __init__(self,
-                 inputs,
-                 name=None,
-                 main_program=None,
-                 startup_program=None):
+    def __init__(self, inputs, name=None):
        for each_input in inputs:
            if not isinstance(each_input, Variable):
                raise TypeError("Each input should be variable")
        self.inputs = inputs
-        self.helper = LayerHelper(
-            'conditional_block',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+        self.helper = LayerHelper('conditional_block', name=name)

    def block(self):
        return ConditionalBlockGuard(self)
@@ -709,15 +805,10 @@ class IfElse(object):
    IN_IF_ELSE_TRUE_BLOCKS = 1
    IN_IF_ELSE_FALSE_BLOCKS = 2

-    def __init__(self, cond, name=None, main_program=None,
-                 startup_program=None):
+    def __init__(self, cond, name=None):
        if not isinstance(cond, Variable):
            raise TypeError("cond must be a Variable")
-        self.helper = LayerHelper(
-            'ifelse',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+        self.helper = LayerHelper('ifelse', name=name)
        self.cond = cond
        self.input_table = {}
        self.status = IfElse.OUT_IF_ELSE_BLOCKS
@@ -782,11 +873,7 @@ class IfElse(object):
            out_table.append(outside_out)

            # assign local var to outside
-            assign(
-                input=each_out,
-                output=outside_out,
-                main_program=self.helper.main_program,
-                startup_program=self.helper.startup_program)
+            assign(input=each_out, output=outside_out)

    def __call__(self):
        if self.status != self.OUT_IF_ELSE_BLOCKS:
@@ -810,9 +897,7 @@ class IfElse(object):
                    in_false=false_var,
                    mask=self.cond,
                    x=self.cond,
-                    level=0,
-                    main_program=self.helper.main_program,
-                    startup_program=self.helper.startup_program))
+                    level=0))
        return rlist


@@ -821,12 +906,8 @@ class DynamicRNN(object):
    IN_RNN = 1
    AFTER_RNN = 2

-    def __init__(self, name=None, main_program=None, startup_program=None):
-        self.helper = LayerHelper(
-            'dynamic_rnn',
-            name=name,
-            main_program=main_program,
-            startup_program=startup_program)
+    def __init__(self, name=None):
+        self.helper = LayerHelper('dynamic_rnn', name=name)
        self.status = DynamicRNN.BEFORE_RNN
        self.lod_rank_table = None
        self.max_seq_len = None
@@ -880,8 +961,7 @@ class DynamicRNN(object):
            inputs={'X': x,
                    'RankTable': self.lod_rank_table},
            outputs={'Out': input_array})
-        return array_read(
-            array=input_array, i=self.step_idx, **self.helper.to_kwargs)
+        return array_read(array=input_array, i=self.step_idx)

    @contextlib.contextmanager
    def block(self):
@@ -892,32 +972,18 @@ class DynamicRNN(object):
        self.status = DynamicRNN.IN_RNN
        with self.while_op.block():
            yield
-            increment(
-                x=self.step_idx,
-                value=1.0,
-                in_place=True,
-                **self.helper.to_kwargs)
+            increment(x=self.step_idx, value=1.0, in_place=True)

            for new_mem, mem_array in self.mem_link:
-                array_write(
-                    x=new_mem,
-                    i=self.step_idx,
-                    array=mem_array,
-                    **self.helper.to_kwargs)
-
-            less_than(
-                x=self.step_idx,
-                y=self.max_seq_len,
-                cond=self.cond,
-                **self.helper.to_kwargs)
+                array_write(x=new_mem, i=self.step_idx, array=mem_array)
+
+            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)

        self.status = DynamicRNN.AFTER_RNN
        for each_array in self.output_array:
            self.outputs.append(
                array_to_lod_tensor(
-                    x=each_array,
-                    table=self.lod_rank_table,
-                    **self.helper.to_kwargs))
+                    x=each_array, table=self.lod_rank_table))

    def __call__(self, *args, **kwargs):
        if self.status != DynamicRNN.AFTER_RNN:
@@ -944,13 +1010,9 @@ class DynamicRNN(object):
                inputs={'X': init,
                        'I': self.zero_idx},
                outputs={'Out': mem_array})
-            retv = array_read(
-                array=mem_array, i=self.step_idx, **self.helper.to_kwargs)
+            retv = array_read(array=mem_array, i=self.step_idx)
            retv = shrink_memory(
-                x=retv,
-                i=self.step_idx,
-                table=self.lod_rank_table,
-                **self.helper.to_kwargs)
+                x=retv, i=self.step_idx, table=self.lod_rank_table)
            self.mem_dict[retv.name] = mem_array
            return retv
        else:

--- a/python/paddle/v2/fluid/layers/io.py
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -10,24 +10,11 @@ def data(name,
         dtype='float32',
         lod_level=0,
         type=core.VarDesc.VarType.LOD_TENSOR,
-         main_program=None,
-         startup_program=None,
         stop_gradient=True):
    """
-    Data Layer.
+    **Data Layer**

-    Args:
-       name: The name/alias of the function
-       shape: Tuple declaring the shape.
-       append_batch_size: Whether or not to append the data as a batch.
-       dtype: The type of data : float32, float_16, int etc
-       type: The output type. By default it is LOD_TENSOR.
-       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
-       stop_gradient: A boolean that mentions whether gradient should flow.
-
-    This function takes in input and based on whether data has
+    This function takes in the input and based on whether data has
    to be returned back as a minibatch, it creates the global variable using
    the helper functions. The global variables can be accessed by all the
    following operations and layers in the graph.
@@ -35,6 +22,24 @@ def data(name,
    All the input variables of this function are passed in as local variables
    to the LayerHelper constructor.

+    Args:
+       name(str): The name/alias of the function
+       shape(list): Tuple declaring the shape.
+       append_batch_size(bool): Whether or not to append the data as a batch.
+       dtype(int|float): The type of data : float32, float_16, int etc
+       type(VarType): The output type. By default it is LOD_TENSOR.
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
+       main_program(Program): Name of the main program that calls this
+       startup_program(Program): Name of the startup program
+       stop_gradient(bool): A boolean that mentions whether gradient should flow.
+
+    Returns:
+        Variable: The global variable that gives access to the data.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='x', shape=[784], dtype='float32')
    """
    helper = LayerHelper('data', **locals())
    shape = list(shape)

--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,12 +5,15 @@ All layers just related to the neural network.
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from ..param_attr import ParamAttr
+from tensor import concat

 __all__ = [
    'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
    'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
    'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
-    'batch_norm', 'beam_search_decode', 'conv2d_transpose'
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
+    'lstm_unit', 'reduce_sum', 'reduce_mean'
 ]


@@ -20,38 +23,85 @@ def fc(input,
       param_attr=None,
       bias_attr=None,
       act=None,
-       name=None,
-       main_program=None,
-       startup_program=None):
+       name=None):
    """
-    Fully Connected Layer.
+    **Fully Connected Layer**
+
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable (one for each input tensor) called weights for each input
+    tensor, which represents a fully connected weight matrix from each input
+    unit to each output unit. The fully connected layer multiplies each input
+    tensor with its corresponding weight to produce an output Tensor. If
+    multiple input tensors are given, the results of multiple multiplications
+    will be summed up. If bias_attr is not None, a bias variable will be
+    created and added to the output. Finally, if activation is not None,
+    it will be applied to the output as well.
+
+    This process can be formulated as follows:
+
+    .. math::
+
+        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+
+    In the above equation:
+
+    * :math:`N`: Number of input tensors.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`W_i`: The weight matrix created by this layer for the i-th input.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.

    Args:
-       input: The input tensor to the function
-       size: The size of the layer
-       num_flatten_dims: Number of columns in input
-       param_attr: The parameters/weights to the FC Layer
-       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
-       bias_attr: The bias parameter for the FC layer
-       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
-       act: Activation to be applied to the output of FC layer
-       name: Name/alias of the function
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
+       input(Variable|list): The input tensor(s) to the fully connected layer.
+       size(int): The number of output units in the fully connected layer.
+       num_flatten_dims(int): The fc layer can accept an input tensor with more
+                              than two dimensions. If this happens, the
+                              multidimensional tensor will first be flattened
+                              into a 2-dimensional matrix. The parameter
+                              `num_flatten_dims` determines how the input tensor
+                              is flattened: the first `num_flatten_dims`
+                              dimensions will be flattened to form the first
+                              dimension of the final matrix (height of the
+                              matrix), and the rest `rank(X) - num_flatten_dims`
+                              dimensions are flattened to form the second
+                              dimension of the final matrix (width of the matrix).
+                              For example, suppose `X` is a 5-dimensional tensor
+                              with a shape [2, 3, 4, 5, 6], and
+                              `num_flatten_dims` = 3. Then, the flattened matrix
+                              will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+                              By default, `num_flatten_dims` is set to 1.
+       param_attr(ParamAttr|list): The parameter attribute for learnable
+                                   parameters/weights of the fully connected
+                                   layer.
+       param_initializer(ParamAttr|list): The initializer used for the
+                                          weight/parameter. If set None,
+                                          XavierInitializer() will be used.
+       bias_attr(ParamAttr|list): The parameter attribute for the bias parameter
+                                  for this layer. If set None, no bias will be
+                                  added to the output units.
+       bias_initializer(ParamAttr|list): The initializer used for the bias.
+                                        If set None, then ConstantInitializer()
+                                        will be used.
+       act(str): Activation to be applied to the output of the fully connected
+                 layer.
+       name(str): Name/alias of the fully connected layer.

-    This function can take in multiple inputs and performs the Fully Connected
-    function (linear transformation) on top of each of them.
-    So for input x, the output will be : Wx + b. Where W is the parameter,
-    b the bias and x is the input.

-    The function also applies an activation (non-linearity) on top of the
-    output, if activation is passed in the input.
+    Returns:
+        Variable: The output tensor variable.

-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.

+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
    """
-    helper = LayerHelper('fc', **locals())
+
+    helper = LayerHelper("fc", **locals())

    dtype = helper.input_dtype()

@@ -71,8 +121,8 @@ def fc(input,
                "Y": w,
            },
            outputs={"Out": tmp},
-            attrs={'x_num_col_dims': num_flatten_dims,
-                   'y_num_col_dims': 1})
+            attrs={"x_num_col_dims": num_flatten_dims,
+                   "y_num_col_dims": 1})
        mul_results.append(tmp)

    # sum
@@ -88,33 +138,32 @@ def fc(input,
    return helper.append_activation(pre_activation)


-def embedding(input,
-              size,
-              is_sparse=False,
-              param_attr=None,
-              dtype='float32',
-              main_program=None,
-              startup_program=None):
+def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
    """
-    Embedding Layer.
+    **Embedding Layer**
+
+    This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table.
+    The result of this lookup is the embedding of each ID in the *input*.
+
+    All the input variables are passed in as local variables to the LayerHelper
+    constructor.

    Args:
-       param_initializer:
-       input: The input to the function
-       size: The size of the layer
-       is_sparse: A flag that decleares whether the input is sparse
-       param_attr: Parameters for this layer
-       dtype: The type of data : float32, float_16, int etc
-       main_program: Name of the main program that calls this
-       startup_program: Name of the startup program
+       input(Variable): Input to the function
+       size(tuple|list|None): Shape of the look up table parameter
+       is_sparse(bool): Boolean flag specifying whether the input is sparse
+       param_attr(ParamAttr): Parameters for this layer
+       dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc

-    This function can take in the input (which is a vector of IDs) and
-    performs a lookup in the lookup_table using these IDs, to result into
-    the embedding of each ID in the input.
+    Returns:
+        Variable: The tensor variable storing the embeddings of the \
+                  supplied inputs.

-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+    Examples:
+        .. code-block:: python

+          data = fluid.layers.data(name='ids', shape=[32, 32], dtype='int64')
+          emb = fluid.layers.embedding(input=data, size=[128, 64])
    """

    helper = LayerHelper('embedding', **locals())
@@ -140,9 +189,7 @@ def dynamic_lstm(input,
                 gate_activation='sigmoid',
                 cell_activation='tanh',
                 candidate_activation='tanh',
-                 dtype='float32',
-                 main_program=None,
-                 startup_program=None):
+                 dtype='float32'):
    helper = LayerHelper('lstm', **locals())
    size = size / 4
    weight = helper.create_parameter(
@@ -185,9 +232,7 @@ def gru_unit(input,
             weight=None,
             bias=None,
             activation='tanh',
-             gate_activation='sigmoid',
-             main_program=None,
-             startup_program=None):
+             gate_activation='sigmoid'):
    """
    GRUUnit Operator implements partial calculations of the GRU unit as following:

@@ -250,11 +295,7 @@ def gru_unit(input,
    return updated_hidden, reset_hidden_pre, gate


-def linear_chain_crf(input,
-                     label,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
+def linear_chain_crf(input, label, param_attr=None):
    helper = LayerHelper('linear_chain_crf', **locals())
    size = input.shape[1]
    transition = helper.create_parameter(
@@ -280,11 +321,7 @@ def linear_chain_crf(input,
    return log_likelihood


-def crf_decoding(input,
-                 param_attr,
-                 label=None,
-                 main_program=None,
-                 startup_program=None):
+def crf_decoding(input, param_attr, label=None):
    helper = LayerHelper('crf_decoding', **locals())
    transition = helper.get_parameter(param_attr.name)
    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -419,8 +456,8 @@ def chunk_eval(input,
        },
        attrs={
            "num_chunk_types": num_chunk_types,
-            'chunk_scheme': chunk_scheme,
-            'excluded_chunk_types': excluded_chunk_types or []
+            "chunk_scheme": chunk_scheme,
+            "excluded_chunk_types": excluded_chunk_types or []
        })
    return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks

@@ -432,9 +469,7 @@ def sequence_conv(input,
                  padding=None,
                  bias_attr=None,
                  param_attr=None,
-                  act=None,
-                  main_program=None,
-                  startup_program=None):
+                  act=None):
    """
    This function creates the op for sequence_conv, using the inputs and
    other convolutional configurations for the filters and stride as given
@@ -477,9 +512,7 @@ def conv2d(input,
           param_attr=None,
           bias_attr=None,
           act=None,
-           name=None,
-           main_program=None,
-           startup_program=None):
+           name=None):
    """
    This function creates the op for a 2-dimensional Convolution.
    This is performed using the parameters of filters(size, dimensionality etc)
@@ -565,9 +598,7 @@ def pool2d(input,
           pool_type,
           pool_stride=None,
           pool_padding=None,
-           global_pooling=False,
-           main_program=None,
-           startup_program=None):
+           global_pooling=False):
    """
    This function adds the operator for pooling in 2 dimensions, using the
    pooling configurations mentioned in input parameters.
@@ -613,9 +644,7 @@ def batch_norm(input,
               epsilon=1e-05,
               param_attr=None,
               bias_attr=None,
-               data_layout='NCHW',
-               main_program=None,
-               startup_program=None):
+               data_layout='NCHW'):
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
@@ -685,7 +714,7 @@ def batch_norm(input,
    return helper.append_activation(batch_norm_out)


-def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+def beam_search_decode(ids, scores):
    helper = LayerHelper('beam_search_decode', **locals())
    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
@@ -708,9 +737,8 @@ def conv2d_transpose(input,
                     filter_size=None,
                     padding=None,
                     stride=None,
-                     param_attr=None,
-                     main_program=None,
-                     startup_program=None):
+                     dilation=None,
+                     param_attr=None):
    """
    The transpose of conv2d layer.

@@ -733,6 +761,9 @@ def conv2d_transpose(input,
        stride(int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation.
        param_attr: Parameter Attribute.
        main_program(Program): the main program
        startup_program(Program): the startup program
@@ -753,10 +784,15 @@ def conv2d_transpose(input,
        op_attr['paddings'] = padding

    if isinstance(stride, int):
-        op_attr['strides'] = stride
+        op_attr['strides'] = [stride, stride]
    elif stride is not None:
        op_attr['strides'] = stride

+    if isinstance(dilation, int):
+        op_attr['dilations'] = [dilation, dilation]
+    elif dilation is not None:
+        op_attr['dilations'] = dilation
+
    if filter_size is None:
        if output_size is None:
            raise ValueError("output_size must be set when filter_size is None")
@@ -765,14 +801,17 @@ def conv2d_transpose(input,

        padding = op_attr.get('paddings', [0, 0])
        stride = op_attr.get('strides', [1, 1])
+        dilation = op_attr.get('dilations', [1, 1])

        h_in = input.shape[2]
        w_in = input.shape[3]
-        filter_size_h = output_size[0] - \
-                        (h_in - 1) * stride[0] + 2 * padding[0]
-        filter_size_w = output_size[1] - \
-                        (w_in - 1) * stride[1] + 2 * padding[1]
+
+        filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) / dilation[0] + 1
+        filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) / dilation[1] + 1
        filter_size = [filter_size_h, filter_size_w]
+
    elif isinstance(filter_size, int):
        filter_size = [filter_size, filter_size]

@@ -789,3 +828,264 @@ def conv2d_transpose(input,
        attrs=op_attr)

    return out
+
+
+def sequence_expand(x, y):
+    """Sequence Expand Layer. This layer expands the input variable **x**
+    according to the LoD information of **y**. The following examples
+    explain how sequence_expand works:
+
+    .. code-block:: text
+
+        * Case 1
+            x is a LoDTensor:
+                x.lod = [[0,       2, 3],
+                         [0, 1,    3, 4]]
+                x.data = [a, b, c, d]
+                x.dims = [4, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0,    2,    4],
+                         [0, 3, 6, 7, 8]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 2-level LoDTensor:
+                out.lod = [[0,                2,    4],
+                           [0,       3,       6, 7, 8]]
+                out.data = [a, a, a, b, b, b, c, d]
+                out.dims = [8, 1]
+
+        * Case 2
+            x is a Tensor:
+                x.data = [a, b, c]
+                x.dims = [3, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0, 2, 3, 6]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 1-level LoDTensor:
+                out.lod = [[0,    2, 3,      6]]
+                out.data = [a, a, b, c, c, c]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a LoDTensor.
+
+    Returns:
+        Variable: The expanded variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[10, 20],
+                                  dtype='float32', lod_level=1)
+            out = fluid.layers.sequence_expand(x=x, y=y)
+    """
+    helper = LayerHelper('sequence_expand', input=x, **locals())
+    dtype = helper.input_dtype()
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sequence_expand', inputs={'X': x,
+                                        'Y': y}, outputs={'Out': tmp})
+    return tmp
+
+
+def lstm_unit(x_t,
+              hidden_t_prev,
+              cell_t_prev,
+              forget_bias=0.0,
+              param_attr=None,
+              bias_attr=None):
+    """Lstm unit layer. The equation of a lstm step is:
+
+        .. math::
+
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
+
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
+
+            c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
+
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
+
+            h_t & = o_t tanh(c_t)
+
+    The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The implementation separates the linear transformation
+    from the non-linear transformation. Here, we take :math:`i_t` as an
+    example. The linear transformation is applied by calling a `fc` layer on
+    the concatenation of :math:`x_t` and :math:`h_{t-1}`, and the equation is:
+
+        .. math::
+
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
+
+    The non-linear transformation is applied by calling `lstm_unit_op` and the
+    equation is:
+
+        .. math::
+
+            i_t = \sigma(L_{i_t})
+
+    This layer has two outputs including :math:`h_t` and :math:`c_t`.
+
+    Args:
+        x_t (Variable): The input value of current step.
+        hidden_t_prev (Variable): The hidden value of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit.
+        forget_bias (float): The forget bias of lstm unit.
+        param_attr (ParamAttr): The attributes of parameter weights, used to set
+            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights. If None, a
+            default ParamAttr is used so that bias weights are created.
+
+    Returns:
+        tuple: The hidden value and cell value of lstm unit.
+
+    Raises:
+        ValueError: If the rank of **x_t**, **hidden_t_prev** or \
+                **cell_t_prev** is not 2, or if the 1st dimensions of \
+                **x_t**, **hidden_t_prev** and **cell_t_prev** differ.
+
+    Examples:
+
+        .. code-block:: python
+
+             x_t = fluid.layers.fc(input=x_t_data, size=10)
+             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
+             hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
+                                                    hidden_t_prev=prev_hidden,
+                                                    cell_t_prev=prev_cell)
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+
+    if len(x_t.shape) != 2:
+        raise ValueError("Rank of x_t must be 2.")
+
+    if len(hidden_t_prev.shape) != 2:
+        raise ValueError("Rank of hidden_t_prev must be 2.")
+
+    if len(cell_t_prev.shape) != 2:
+        raise ValueError("Rank of cell_t_prev must be 2.")
+
+    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
+            0] != cell_t_prev.shape[0]:
+        raise ValueError("The 1st dimension of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if bias_attr is None:
+        bias_attr = ParamAttr()
+
+    size = cell_t_prev.shape[1]
+    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
+    fc_out = fc(input=concat_out,
+                size=4 * size,
+                param_attr=param_attr,
+                bias_attr=bias_attr)
+    dtype = x_t.dtype
+    c = helper.create_tmp_variable(dtype)
+    h = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm_unit',
+        inputs={"X": fc_out,
+                "C_prev": cell_t_prev},
+        outputs={"C": c,
+                 "H": h},
+        attrs={"forget_bias": forget_bias})
+
+    return h, c
+
+
+def reduce_sum(input, dim=None, keep_dim=False):
+    """
+    Computes the sum of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the sum is performed. If
+            :attr:`None`, sum all elements of :attr:`input` and return a
+            Tensor variable with a single element, otherwise must be in the
+            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
+            the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to keep the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the corresponding output tensor.
+            fluid.layers.reduce_sum(x)  # [3.5]
+            fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
+            fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
+            fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+    """
+    helper = LayerHelper('reduce_sum', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim is not None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': dim is None
+        })
+    return out
+
+
+def reduce_mean(input, dim=None, keep_dim=False):
+    """
+    Computes the mean of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the mean is computed. If
+            :attr:`None`, compute the mean over all elements of :attr:`input`
+            and return a Tensor variable with a single element, otherwise
+            must be in the range :math:`[-rank(input), rank(input))`. If
+            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to keep the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the corresponding output tensor.
+            fluid.layers.reduce_mean(x)  # [0.4375]
+            fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
+            fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
+            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+    """
+    helper = LayerHelper('reduce_mean', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_mean',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim is not None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': dim is None
+        })
+    return out
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -2,7 +2,7 @@ from ..registry import register_layer
 __all__ = [
    'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose',
    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
-    'elementwise_sub', 'elementwise_mul', 'clip', 'abs'
+    'elementwise_sub', 'elementwise_mul', 'clip', 'abs', 'sequence_softmax'
 ]

 for _OP in set(__all__):

--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -6,12 +6,12 @@ __all__ = [
 ]


-def create_tensor(dtype, name=None, main_program=None, startup_program=None):
+def create_tensor(dtype, name=None):
    helper = LayerHelper("create_tensor", **locals())
    return helper.create_variable(name=helper.name, dtype=dtype)


-def cast(x, dtype, main_program=None):
+def cast(x, dtype):
    """
    This function takes in the input with input_dtype
    and casts it to the output_dtype as the output.
@@ -27,10 +27,23 @@ def cast(x, dtype, main_program=None):
    return out


-def concat(input, axis, main_program=None, startup_program=None):
+def concat(input, axis=0):
    """
-    This function concats the input along the axis mentioned
+    **Concat**
+
+    This function concatenates the input along the axis mentioned
    and returns that as the output.
+
+    Args:
+        input(list): List of tensors to be concatenated
+        axis(int): Integer axis along which the tensors will be concatenated
+
+    Returns:
+        Variable: Output variable of the concatenation
+
+    Examples:
+        .. code-block:: python
+          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
    """
    helper = LayerHelper('concat', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -42,10 +55,29 @@ def concat(input, axis, main_program=None, startup_program=None):
    return out


-def sums(input, out=None, main_program=None, startup_program=None):
-    """
-    This function takes in the input and performs the sum operation on it
-    and returns that as the output.
+def sums(input, out=None):
+    """This function performs the sum operation on the input and returns the
+    result as the output.
+
+    Args:
+        input (Variable|list): The input tensor that has the elements
+                               that need to be summed up.
+
+    Returns:
+        Variable: The tensor type variable that has the sum of input
+                  written to it.
+
+    Examples:
+        .. code-block:: python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          a0 = layers.array_read(array=tmp, i=i)
+          i = layers.increment(x=i)
+          a1 = layers.array_read(array=tmp, i=i)
+          mean_a0 = layers.mean(x=a0)
+          mean_a1 = layers.mean(x=a1)
+          a_sum = layers.sums(input=[mean_a0, mean_a1])
    """
    helper = LayerHelper('sum', **locals())
    if out is None:
@@ -54,7 +86,25 @@ def sums(input, out=None, main_program=None, startup_program=None):
    return out


-def assign(input, output, main_program=None, startup_program=None):
+def assign(input, output):
+    """
+    **Assign**
+
+    This function copies the *input* Variable to the *output* Variable.
+
+    Args:
+        input(Variable): The source variable
+        output(Variable): The destination variable
+
+    Returns:
+        Variable: The destination variable that was supplied as the *output*.
+
+    Examples:
+        .. code-block:: python
+          out = fluid.layers.create_tensor(dtype='float32')
+          hidden = fluid.layers.fc(input=data, size=10)
+          fluid.layers.assign(hidden, out)
+    """
    helper = LayerHelper('assign', **locals())
    helper.append_op(
        type='scale',
@@ -64,16 +114,28 @@ def assign(input, output, main_program=None, startup_program=None):
    return output


-def fill_constant(shape,
-                  dtype,
-                  value,
-                  out=None,
-                  main_program=None,
-                  startup_program=None):
+def fill_constant(shape, dtype, value, out=None):
    """
-    This function creates a tensor , with shape as mentioned in the input and
-    specified dtype and fills this up with a constant value that
-    comes in the input. It also sets the stop_gradient to be True.
+    **fill_constant**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with a constant supplied in *value*.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        out(Variable): Output Variable to initialize
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
    """
    helper = LayerHelper("fill_constant", **locals())
    if out is None:
@@ -94,9 +156,32 @@ def fill_constant_batch_size_like(input,
                                  dtype,
                                  value,
                                  input_dim_idx=0,
-                                  output_dim_idx=0,
-                                  main_program=None,
-                                  startup_program=None):
+                                  output_dim_idx=0):
+    """
+    **fill_constant_batch_size_like**
+
+    This function creates a tensor of specified *shape*, *dtype* and batch size,
+    and initializes this with a constant supplied in *value*. The batch size is
+    obtained from the `input` tensor.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        input(Variable): Tensor whose dimensions will be used to get batch size
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        input_dim_idx(int): Index of input's batch size dimension
+        output_dim_idx(int): Index of output's batch size dimension
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant_batch_size_like(input=like, shape=[1], value=0, dtype='int64')
+    """
    helper = LayerHelper("fill_constant_batch_size_like", **locals())
    out = helper.create_tmp_variable(dtype=dtype)
    helper.append_op(
@@ -114,7 +199,7 @@ def fill_constant_batch_size_like(input,
    return out


-def ones(shape, dtype, main_program=None):
+def ones(shape, dtype):
    """
    This function performs the same function as fill_constant() declared above
    with the constant value being 1.0.
@@ -122,7 +207,7 @@ def ones(shape, dtype, main_program=None):
    return fill_constant(value=1.0, **locals())


-def zeros(shape, dtype, main_program=None):
+def zeros(shape, dtype):
    """
    This function performs the same function as fill_constant() declared above
    with the constant value being 0.0.

--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -10,25 +10,19 @@ def simple_img_conv_pool(input,
                         pool_stride,
                         act,
                         param_attr=None,
-                         pool_type='max',
-                         main_program=None,
-                         startup_program=None):
+                         pool_type='max'):
    conv_out = layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
-        act=act,
-        main_program=main_program,
-        startup_program=startup_program)
+        act=act)

    pool_out = layers.pool2d(
        input=conv_out,
        pool_size=pool_size,
        pool_type=pool_type,
-        pool_stride=pool_stride,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=pool_stride)
    return pool_out


@@ -42,9 +36,7 @@ def img_conv_group(input,
                   conv_with_batchnorm=False,
                   conv_batchnorm_drop_rate=None,
                   pool_stride=1,
-                   pool_type=None,
-                   main_program=None,
-                   startup_program=None):
+                   pool_type=None):
    """
    Image Convolution Group, Used for vgg net.
    """
@@ -75,31 +67,19 @@ def img_conv_group(input,
            filter_size=conv_filter_size[i],
            padding=conv_padding[i],
            param_attr=param_attr[i],
-            act=local_conv_act,
-            main_program=main_program,
-            startup_program=startup_program)
+            act=local_conv_act)

        if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(
-                input=tmp,
-                act=conv_act,
-                main_program=main_program,
-                startup_program=startup_program)
+            tmp = layers.batch_norm(input=tmp, act=conv_act)
            drop_rate = conv_batchnorm_drop_rate[i]
            if abs(drop_rate) > 1e-5:
-                tmp = layers.dropout(
-                    x=tmp,
-                    dropout_prob=drop_rate,
-                    main_program=main_program,
-                    startup_program=startup_program)
+                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)

    pool_out = layers.pool2d(
        input=tmp,
        pool_size=pool_size,
        pool_type=pool_type,
-        pool_stride=pool_stride,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=pool_stride)
    return pool_out


@@ -108,21 +88,13 @@ def sequence_conv_pool(input,
                       filter_size,
                       param_attr=None,
                       act="sigmoid",
-                       pool_type="max",
-                       main_program=None,
-                       startup_program=None):
+                       pool_type="max"):
    conv_out = layers.sequence_conv(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        param_attr=param_attr,
-        act=act,
-        main_program=main_program,
-        startup_program=startup_program)
+        act=act)

-    pool_out = layers.sequence_pool(
-        input=conv_out,
-        pool_type=pool_type,
-        main_program=main_program,
-        startup_program=startup_program)
+    pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
    return pool_out
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -2,10 +2,11 @@ from collections import defaultdict

 import framework
 from backward import append_backward_ops
-from framework import unique_name
+from framework import unique_name, program_guard
 from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
+from clip import append_gradient_clip_ops

 __all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']

@@ -159,10 +160,8 @@ class Optimizer(object):

        # Create any accumulators
        program = loss.block.program
-        self.helper = LayerHelper(
-            self.__class__.__name__,
-            main_program=program,
-            startup_program=startup_program)
+        with program_guard(program, startup_program):
+            self.helper = LayerHelper(self.__class__.__name__)
            self._create_accumulators(loss.block,
                                      [p[0] for p in parameters_and_grads])

@@ -199,12 +198,16 @@ class Optimizer(object):
        `create_optimization_pass()` into one.
        """
        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
+
+        params_grads = append_gradient_clip_ops(params_grads)
+
        # Add regularization if any
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)
+
        optimize_ops = self.create_optimization_pass(params_grads, loss,
                                                     startup_program)
-        return optimize_ops
+        return optimize_ops, params_grads


 class SGDOptimizer(Optimizer):

--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
 from initializer import Initializer, Xavier, Constant
 from regularizer import WeightDecayRegularizer

+__all__ = ['ParamAttr']
+

 class ParamAttr(object):
    def __init__(self,
@@ -8,12 +10,14 @@ class ParamAttr(object):
                 initializer=None,
                 learning_rate=1.0,
                 regularizer=None,
-                 trainable=True):
+                 trainable=True,
+                 clip=None):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.regularizer = regularizer
        self.trainable = trainable
+        self.clip = clip

    def set_default_initializer(self, initializer):
        if initializer is None:
@@ -54,9 +58,12 @@ class ParamAttr(object):
    def to_kwargs(self, with_initializer=False):
        kwargs = {
            'name': self.name,
-            'learning_rate': self.learning_rate,
+            'optimize_attr': {
+                'learning_rate': self.learning_rate
+            },
            'regularizer': self.regularizer,
-            'trainable': self.trainable
+            'trainable': self.trainable,
+            'clip_attr': self.clip
        }
        if with_initializer:
            kwargs['initializer'] = self.initializer

--- a/python/paddle/v2/fluid/tests/.gitignore
+++ b/python/paddle/v2/fluid/tests/.gitignore
 image/
 fit_a_line.model/
 tmp
+cuda_profiler.txt
--- a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
+++ b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py
+# Distributed variant of the recognize_digits conv example. The process role
+# is read from the TRAINING_ROLE environment variable (default "TRAINER") and
+# the parameter-server endpoints from PSERVERS. The trainer exits with status
+# 0 as soon as loss < 10.0 and pass accuracy > 0.9; every other path falls
+# through to the final exit(1).
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    input=images,
+    filter_size=5,
+    num_filters=20,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    input=conv_pool_1,
+    filter_size=5,
+    num_filters=50,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+BATCH_SIZE = 50
+PASS_NUM = 3
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+t = fluid.DistributeTranspiler()
+pserver_endpoints = os.getenv("PSERVERS")
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1)
+
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops)
+    exe.run(fluid.default_startup_program())
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            # print loss, acc
+            if loss < 10.0 and pass_acc > 0.9:
+                # An average cost below 10.0 together with an accuracy above
+                # 0.9 is treated as success.
+                exit(0)
+
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+else:
+    print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
+
+exit(1)
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -11,7 +11,9 @@ regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
 hidden1 = fluid.layers.fc(input=image,
                          size=128,
                          act='relu',
-                          param_attr=regularizer)
+                          param_attr=fluid.ParamAttr(
+                              regularizer=regularizer,
+                              clip=fluid.clip.ClipByValue(10)))
 hidden2 = fluid.layers.fc(input=hidden1,
                          size=64,
                          act='relu',
@@ -33,11 +35,10 @@ opts = optimizer.minimize(avg_cost)
 accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

 inference_program = fluid.default_main_program().clone()
-test_accuracy = fluid.evaluator.Accuracy(
-    input=predict, label=label, main_program=inference_program)
-test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
-inference_program = fluid.io.get_inference_program(
-    test_target, main_program=inference_program)
+with fluid.program_guard(inference_program):
+    test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
+    inference_program = fluid.io.get_inference_program(test_target)

 train_reader = paddle.batch(
    paddle.reader.shuffle(

--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -4,12 +4,7 @@ import paddle.v2.fluid as fluid
 from paddle.v2.fluid.layer_helper import LayerHelper


-def lstm(x,
-         c_pre_init,
-         hidden_dim,
-         forget_bias=None,
-         main_program=None,
-         startup_program=None):
+def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
    """
    This function helps create an operator for the LSTM (Long Short Term
    Memory) cell that can be used inside an RNN.
@@ -20,15 +15,8 @@ def lstm(x,
        c_pre = rnn.memory(init=c_pre_init)
        x_t = rnn.step_input(x)

-        before_fc = fluid.layers.concat(
-            input=[x_t, c_pre],
-            axis=1,
-            main_program=main_program,
-            startup_program=startup_program)
-        after_fc = fluid.layers.fc(input=before_fc,
-                                   size=hidden_dim * 4,
-                                   main_program=main_program,
-                                   startup_program=startup_program)
+        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
+        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)

        dtype = x.dtype
        c = helper.create_tmp_variable(dtype)

--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -3,10 +3,7 @@ import numpy as np
 from op_test import OpTest
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-
-
-def grad_var_name(var_name):
-    return var_name + "@GRAD"
+from paddle.v2.fluid.framework import grad_var_name


 def get_backward_op(scope, op, no_grad_set):
@@ -211,7 +208,7 @@ class TestBatchNormOp(OpTest):
        print 'python: NHWC, NCHW, backward checking passed'

    def test_forward_backward(self):
-        def test_with_place(place, tensor_format, shape):
+        def test_with_place(place, data_layout, shape):
            # attr
            epsilon = 0.00001
            momentum = 0.9
@@ -295,7 +292,7 @@ class TestBatchNormOp(OpTest):
                SavedVariance="saved_variance",
                # attrs
                is_test=False,
-                tensor_format=tensor_format,
+                data_layout=data_layout,
                momentum=momentum,
                epsilon=epsilon)

@@ -314,7 +311,7 @@ class TestBatchNormOp(OpTest):
                atol = 1e-4
            self.__assert_close(variance_out_tensor, variance_out,
                                "variance_out", atol)
-            print "op test forward passed: ", str(place), tensor_format
+            print "op test forward passed: ", str(place), data_layout

            # run backward
            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
@@ -339,11 +336,15 @@ class TestBatchNormOp(OpTest):
            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), tensor_format
+            print "op test backward passed: ", str(place), data_layout

        places = [core.CPUPlace()]
        if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
            places.append(core.GPUPlace(0))
+
+            core.init_devices(["CPU", "GPU:0"])
+        else:
+            core.init_devices(["CPU"])
        for place in places:
            for data_format in ["NCHW", "NHWC"]:
                test_with_place(place, data_format, [2, 3, 4, 5])

--- a/python/paddle/v2/fluid/tests/test_const_value.py
+++ b/python/paddle/v2/fluid/tests/test_const_value.py
+import unittest
+import paddle.v2.fluid.framework as framework
+
+
+class ConditionalBlock(unittest.TestCase):
+    """Checks the string constants exposed by the framework module."""
+
+    def test_const_value(self):
+        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
+        self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
+        self.assertEqual(framework.ZERO_VAR_SUFFIX, "@ZERO")
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
@@ -3,14 +3,17 @@ import numpy as np
 from op_test import OpTest


-def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
+def conv2dtranspose_forward_naive(input_, filter_, attrs):
    in_n, in_c, in_h, in_w = input_.shape
    f_c, out_c, f_h, f_w = filter_.shape
    assert in_c == f_c

-    stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad']
-    out_h = (in_h - 1) * stride[0] + f_h
-    out_w = (in_w - 1) * stride[1] + f_w
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+    d_bolck_h = dilations[0] * (f_h - 1) + 1
+    d_bolck_w = dilations[1] * (f_w - 1) + 1
+    out_h = (in_h - 1) * stride[0] + d_bolck_h
+    out_w = (in_w - 1) * stride[1] + d_bolck_w

    out = np.zeros((in_n, out_c, out_h, out_w))

@@ -23,9 +26,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):

                for k in range(out_c):
                    tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
-                    i1, i2 = i * stride[0], i * stride[0] + f_h
-                    j1, j2 = j * stride[0], j * stride[0] + f_w
-                    out[n, k, i1:i2, j1:j2] += tmp_out
+                    i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
+                    j1, j2 = j * stride[1], j * stride[1] + d_bolck_w
+                    out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out

    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
    return out
@@ -37,11 +40,8 @@ class TestConv2dTransposeOp(OpTest):
        self.init_op_type()
        self.init_test_case()

-        conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
        input_ = np.random.random(self.input_size).astype("float32")
        filter_ = np.random.random(self.filter_size).astype("float32")
-        output = conv2dtranspose_forward_naive(
-            input_, filter_, conv2dtranspose_param).astype('float32')

        self.inputs = {'Input': input_, 'Filter': filter_}
        self.attrs = {
@@ -49,6 +49,10 @@ class TestConv2dTransposeOp(OpTest):
            'paddings': self.pad,
            'dilations': self.dilations
        }
+
+        output = conv2dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype('float32')
+
        self.outputs = {'Output': output}

    def test_check_output(self):
@@ -104,11 +108,60 @@ class TestWithStride(TestConv2dTransposeOp):
        self.filter_size = [f_c, 6, 3, 3]


+class TestWithDilation(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
 # ------------ test_cudnn ------------
 class TestCudnn(TestConv2dTransposeOp):
    def init_op_type(self):
        self.op_type = "conv2d_transpose_cudnn"


+class TestCudnnWithPad(TestWithPad):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose_cudnn"
+
+
+class TestCudnnWithStride(TestWithStride):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose_cudnn"
+
+
+# cuDNN v5 does not support dilated convolution, so this case stays disabled.
+# class TestCudnnWithDilation(TestWithDilation):
+#     def init_test_case(self):
+#         self.pad = [1, 1]
+#         self.stride = [2, 2]
+#         self.dilations = [2, 2]
+#         self.input_size = [2, 3, 5, 5]  # NCHW
+#         f_c = self.input_size[1]
+#         self.filter_size = [f_c, 6, 3, 3]
+#
+#     def init_op_type(self):
+#         self.op_type = "conv2d_transpose_cudnn"
+
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
@@ -3,15 +3,20 @@ import numpy as np
 from op_test import OpTest


-def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
+def conv3dtranspose_forward_naive(input_, filter_, attrs):
    in_n, in_c, in_d, in_h, in_w = input_.shape
    f_c, out_c, f_d, f_h, f_w = filter_.shape
    assert in_c == f_c

-    stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad']
-    out_d = (in_d - 1) * stride[0] + f_d
-    out_h = (in_h - 1) * stride[1] + f_h
-    out_w = (in_w - 1) * stride[2] + f_w
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+
+    d_bolck_d = dilations[0] * (f_d - 1) + 1
+    d_bolck_h = dilations[1] * (f_h - 1) + 1
+    d_bolck_w = dilations[2] * (f_w - 1) + 1
+    out_d = (in_d - 1) * stride[0] + d_bolck_d
+    out_h = (in_h - 1) * stride[1] + d_bolck_h
+    out_w = (in_w - 1) * stride[2] + d_bolck_w
    out = np.zeros((in_n, out_c, out_d, out_h, out_w))

    for n in range(in_n):
@@ -25,10 +30,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
                    for k in range(out_c):
                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
                                         axis=0)
-                        d1, d2 = d * stride[0], d * stride[0] + f_d
-                        i1, i2 = i * stride[1], i * stride[1] + f_h
-                        j1, j2 = j * stride[2], j * stride[2] + f_w
-                        out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out
+                        d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
+                        i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
+                        j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
+                        out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:
+                            dilations[2]] += tmp_out

    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
              pad[2]]
@@ -41,18 +47,19 @@ class TestConv3dTransposeOp(OpTest):
        self.init_op_type()
        self.init_test_case()

-        conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
        input_ = np.random.random(self.input_size).astype("float32")
        filter_ = np.random.random(self.filter_size).astype("float32")
-        output = conv3dtranspose_forward_naive(
-            input_, filter_, conv3dtranspose_param).astype("float32")

        self.inputs = {'Input': input_, 'Filter': filter_}
        self.attrs = {
            'strides': self.stride,
            'paddings': self.pad,
-            # 'dilations': self.dilations
+            'dilations': self.dilations
        }
+
+        output = conv3dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype("float32")
+
        self.outputs = {'Output': output}

    def test_check_output(self):
@@ -108,11 +115,60 @@ class TestWithStride(TestConv3dTransposeOp):
        self.filter_size = [f_c, 6, 3, 3, 3]


+class TestWithDilation(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [2, 2, 2]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
 # ------------ test_cudnn ------------
 class TestCudnn(TestConv3dTransposeOp):
    def init_op_type(self):
        self.op_type = "conv3d_transpose_cudnn"


+class TestCudnnWithPad(TestWithPad):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"
+
+
+class TestCudnnWithStride(TestWithStride):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose_cudnn"
+
+
+# cuDNN v5 does not support dilated convolution, so this case stays disabled.
+# class TestCudnnWithDilation(TestWithDilation):
+#     def init_test_case(self):
+#         self.pad = [1, 1, 1]
+#         self.stride = [2, 2, 2]
+#         self.dilations = [2, 2, 2]
+#         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+#         f_c = self.input_size[1]
+#         self.filter_size = [f_c, 6, 3, 3, 3]
+#
+#     def init_op_type(self):
+#         self.op_type = "conv3d_transpose_cudnn"
+
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_dropout_op.py
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -47,7 +47,9 @@ class TestDropoutOp4(OpTest):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
        self.attrs = {'dropout_prob': 0.35, 'is_test': True}
-        self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }

    def test_check_output(self):
        self.check_output()
@@ -58,7 +60,9 @@ class TestDropoutOp5(OpTest):
        self.op_type = "dropout"
        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
-        self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }

    def test_check_output(self):
        self.check_output()

--- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
@@ -7,7 +7,7 @@ class TestFillZerosLikeOp(OpTest):
    def setUp(self):
        self.op_type = "fill_zeros_like"
        self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
-        self.outputs = {'Y': np.zeros_like(self.inputs["X"])}
+        self.outputs = {'Out': np.zeros_like(self.inputs["X"])}

    def test_check_output(self):
        self.check_output()

--- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
 import unittest
+import numpy
+
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.op import Operator
-import numpy
+from paddle.v2.fluid.executor import Executor


 class TestGaussianRandomOp(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "gaussian_random"
+        self.inputs = {}
+        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+
+        self.outputs = ["Out"]
+
    def test_cpu(self):
-        self.gaussian_random_test(place=core.CPUPlace())
+        self.gaussian_random_test(place=fluid.CPUPlace())

    def test_gpu(self):
        if core.is_compile_gpu():
-            self.gaussian_random_test(place=core.GPUPlace(0))
+            self.gaussian_random_test(place=fluid.GPUPlace(0))

    def gaussian_random_test(self, place):
-        scope = core.Scope()
-        scope.var('Out').get_tensor()
-
-        op = Operator(
-            "gaussian_random",
-            Out='Out',
-            shape=[1000, 784],
-            mean=.0,
-            std=1.,
-            seed=10)

        context = core.DeviceContext.create(place)
-        op.run(scope, context)
-        tensor = numpy.array(scope.find_var('Out').get_tensor())
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+        tensor = outs[0]
+
        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
        self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)


--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -5,12 +5,7 @@ import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program


-def conv_block(input,
-               num_filter,
-               groups,
-               dropouts,
-               main_program=None,
-               startup_program=None):
+def conv_block(input, num_filter, groups, dropouts):
    return nets.img_conv_group(
        input=input,
        pool_size=2,
@@ -20,90 +15,54 @@ def conv_block(input,
        conv_act='relu',
        conv_with_batchnorm=True,
        conv_batchnorm_drop_rate=dropouts,
-        pool_type='max',
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_type='max')


 class TestLayer(unittest.TestCase):
    def test_batch_norm_layer(self):
        main_program = Program()
        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
            images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program)
-        hidden1 = fluid.layers.batch_norm(
-            input=images,
-            main_program=main_program,
-            startup_program=startup_program)
-        hidden2 = fluid.layers.fc(input=hidden1,
-                                  size=128,
-                                  act='relu',
-                                  main_program=main_program)
-        hidden3 = fluid.layers.batch_norm(
-            input=hidden2,
-            main_program=main_program,
-            startup_program=startup_program)
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            hidden1 = fluid.layers.batch_norm(input=images)
+            hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
+            fluid.layers.batch_norm(input=hidden2)

        print str(main_program)

    def test_dropout_layer(self):
        main_program = Program()
        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
            images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program)
-        fluid.layers.dropout(
-            x=images,
-            dropout_prob=0.5,
-            main_program=main_program,
-            startup_program=startup_program)
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.dropout(x=images, dropout_prob=0.5)

-        # print str(main_program)
+        print str(main_program)

    def test_img_conv_group(self):
        main_program = Program()
        startup_program = Program()

+        with fluid.program_guard(main_program, startup_program):
            images = fluid.layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        conv1 = conv_block(images, 64, 2, [0.3, 0], main_program,
-                           startup_program)
-        conv2 = conv_block(conv1, 256, 3, [0.4, 0.4, 0], main_program,
-                           startup_program)
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            conv1 = conv_block(images, 64, 2, [0.3, 0])
+            conv_block(conv1, 256, 3, [0.4, 0.4, 0])

-        # print str(main_program)
+        print str(main_program)

    def test_elementwise_add_with_act(self):
        main_program = Program()
        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
            image1 = fluid.layers.data(
-            name='pixel1',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
+                name='pixel1', shape=[3, 48, 48], dtype='float32')
            image2 = fluid.layers.data(
-            name='pixel2',
-            shape=[3, 48, 48],
-            dtype='float32',
-            main_program=main_program,
-            startup_program=startup_program)
-        out = fluid.layers.elementwise_add(
-            x=image1,
-            y=image2,
-            act='relu',
-            main_program=main_program,
-            startup_program=startup_program)
-        # print(main_program)
+                name='pixel2', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.elementwise_add(x=image1, y=image2, act='relu')
+        print(main_program)


 if __name__ == '__main__':

--- a/python/paddle/v2/fluid/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -6,7 +6,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.executor as executor
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.optimizer as optimizer
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.io import save_inference_model, load_inference_model


@@ -16,32 +16,15 @@ class TestBook(unittest.TestCase):

        init_program = Program()
        program = Program()
-        x = layers.data(
-            name='x',
-            shape=[2],
-            dtype='float32',
-            main_program=program,
-            startup_program=init_program)
-        y = layers.data(
-            name='y',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            startup_program=init_program)
-
-        y_predict = layers.fc(input=x,
-                              size=1,
-                              act=None,
-                              main_program=program,
-                              startup_program=init_program)
-
-        cost = layers.square_error_cost(
-            input=y_predict,
-            label=y,
-            main_program=program,
-            startup_program=init_program)
-        avg_cost = layers.mean(
-            x=cost, main_program=program, startup_program=init_program)
+
+        with program_guard(program, init_program):
+            x = layers.data(name='x', shape=[2], dtype='float32')
+            y = layers.data(name='y', shape=[1], dtype='float32')
+
+            y_predict = layers.fc(input=x, size=1, act=None)
+
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)

            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
            sgd_optimizer.minimize(avg_cost, init_program)

--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,6 +161,41 @@ class TestBook(unittest.TestCase):
                    x=dat, label=lbl))
        print(str(program))

+    def test_sequence_expand(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=1)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+        print(str(program))
+
+    def test_lstm_unit(self):
+        program = Program()
+        with program_guard(program):
+            x_t_data = layers.data(
+                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t = layers.fc(input=x_t_data, size=10)
+            prev_hidden_data = layers.data(
+                name='prev_hidden_data', shape=[10, 20], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+            prev_cell_data = layers.data(
+                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell = layers.fc(input=prev_cell_data, size=30)
+            self.assertIsNotNone(
+                layers.lstm_unit(
+                    x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+        print(str(program))
+
+    def test_sequence_softmax(self):
+        program = Program()
+        with program_guard(program):
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq = layers.fc(input=seq_data, size=20)
+            self.assertIsNotNone(layers.sequence_softmax(x=seq))
+        print(str(program))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import numpy
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops

@@ -118,15 +118,16 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
        place = self.place()
        program = Program()
-        x = layers.data(name='x', shape=[10], main_program=program)
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10])
            x.persistable = True
-        table = layers.lod_rank_table(x, level=level, main_program=program)
-        max_len = layers.max_sequence_len(table, main_program=program)
+            table = layers.lod_rank_table(x, level=level)
+            max_len = layers.max_sequence_len(table)
            max_len.persistable = True
-        array = layers.lod_tensor_to_array(x, table, main_program=program)
+            array = layers.lod_tensor_to_array(x, table)
            array.persistable = True

-        result = layers.array_to_lod_tensor(array, table, main_program=program)
+            result = layers.array_to_lod_tensor(array, table)
            result.persistable = True
        exe = Executor(place)
        scope = core.Scope()
@@ -160,17 +161,14 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
        place = core.CPUPlace()
        program = Program()

+        with program_guard(program):
            x = layers.data(
-            name='x',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            stop_gradient=False)
-        table = layers.lod_rank_table(x, level=0, main_program=program)
-        array = layers.lod_tensor_to_array(x, table, main_program=program)
-        result = layers.array_to_lod_tensor(array, table, main_program=program)
-
-        mean = layers.mean(x=result, main_program=program)
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            table = layers.lod_rank_table(x, level=0)
+            array = layers.lod_tensor_to_array(x, table)
+            result = layers.array_to_lod_tensor(array, table)
+
+            mean = layers.mean(x=result)

            append_backward_ops(mean)


--- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard, default_main_program, default_startup_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.optimizer import MomentumOptimizer
 import paddle.v2.fluid.core as core
@@ -10,44 +10,42 @@ import numpy as np

 class TestMNISTIfElseOp(unittest.TestCase):
    def test_raw_api(self):
-        kwargs = {'startup_program': Program(), 'main_program': Program()}
-        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')

-        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(
-            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
-
-        cond = layers.less_than(x=label, y=limit, **kwargs)
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
            true_image, false_image = layers.split_lod_tensor(
-            input=image, mask=cond, **kwargs)
+                input=image, mask=cond)

-        true_out = layers.create_tensor(dtype='float32', **kwargs)
-        true_cond = layers.ConditionalBlock([true_image], **kwargs)
+            true_out = layers.create_tensor(dtype='float32')
+            true_cond = layers.ConditionalBlock([true_image])

            with true_cond.block():
-            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            layers.assign(input=prob, output=true_out, **kwargs)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=true_out)

-        false_out = layers.create_tensor(dtype='float32', **kwargs)
-        false_cond = layers.ConditionalBlock([false_image], **kwargs)
+            false_out = layers.create_tensor(dtype='float32')
+            false_cond = layers.ConditionalBlock([false_image])

            with false_cond.block():
-            hidden = layers.fc(input=false_image,
-                               size=200,
-                               act='tanh',
-                               **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
-            layers.assign(input=prob, output=false_out, **kwargs)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=false_out)

            prob = layers.merge_lod_tensor(
-            in_true=true_out, in_false=false_out, mask=cond, x=image, **kwargs)
-        loss = layers.cross_entropy(input=prob, label=label, **kwargs)
-        avg_loss = layers.mean(x=loss, **kwargs)
+                in_true=true_out, in_false=false_out, mask=cond, x=image)
+            loss = layers.cross_entropy(input=prob, label=label)
+            avg_loss = layers.mean(x=loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-        optimizer.minimize(avg_loss, kwargs['startup_program'])
+            optimizer.minimize(avg_loss, startup_prog)

        train_reader = paddle.batch(
            paddle.reader.shuffle(
@@ -57,7 +55,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
        place = core.CPUPlace()
        exe = Executor(place)

-        exe.run(kwargs['startup_program'])
+        exe.run(startup_prog)
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
@@ -65,7 +63,7 @@ class TestMNISTIfElseOp(unittest.TestCase):
                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                y_data = np.expand_dims(y_data, axis=1)

-                outs = exe.run(kwargs['main_program'],
+                outs = exe.run(prog,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
@@ -75,39 +73,36 @@ class TestMNISTIfElseOp(unittest.TestCase):
        self.assertFalse(True)

    def test_ifelse(self):
-        kwargs = {'startup_program': Program(), 'main_program': Program()}
-        image = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')

-        label = layers.data(name='y', shape=[1], dtype='int64', **kwargs)
+            label = layers.data(name='y', shape=[1], dtype='int64')

            limit = layers.fill_constant_batch_size_like(
-            input=label, dtype='int64', shape=[1], value=5.0, **kwargs)
-
-        cond = layers.less_than(x=label, y=limit, **kwargs)
-
-        ie = layers.IfElse(cond, **kwargs)
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            ie = layers.IfElse(cond)

            with ie.true_block():
                true_image = ie.input(image)
-            hidden = layers.fc(input=true_image, size=100, act='tanh', **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            with ie.false_block():
                false_image = ie.input(image)
-            hidden = layers.fc(input=false_image,
-                               size=200,
-                               act='tanh',
-                               **kwargs)
-            prob = layers.fc(input=hidden, size=10, act='softmax', **kwargs)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
                ie.output(prob)

            prob = ie()
-        loss = layers.cross_entropy(input=prob[0], label=label, **kwargs)
-        avg_loss = layers.mean(x=loss, **kwargs)
+            loss = layers.cross_entropy(input=prob[0], label=label)
+            avg_loss = layers.mean(x=loss)

            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-        optimizer.minimize(avg_loss, kwargs['startup_program'])
+            optimizer.minimize(avg_loss, startup_prog)
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                paddle.dataset.mnist.train(), buf_size=8192),
@@ -135,4 +130,5 @@ class TestMNISTIfElseOp(unittest.TestCase):


 if __name__ == '__main__':
-    unittest.main()
+    # temp disable if else unittest since it could be buggy.
+    exit(0)
--- a/python/paddle/v2/fluid/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
 import unittest
+
 import paddle.v2.fluid.op as op
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2



--- a/python/paddle/v2/fluid/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase):
        block.append_op(
            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
        sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
-        opts = sgd_optimizer.minimize(mean_out, init_program)
+        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
        self.assertEqual(len(opts), 1)
        sgd_op = opts[0]
        self.assertEqual(sgd_op.type, "sgd")
@@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase):
        learning_rate = 0.01
        sgd_optimizer = optimizer.SGDOptimizer(
            learning_rate=learning_rate, global_step=global_step)
-        opts = sgd_optimizer.minimize(mean_out, init_program)
+        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
        self.assertEqual(len(opts), 2)
        sgd_op = opts[0]
        self.assertEqual(sgd_op.type, "sgd")

--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
 from __future__ import print_function
 import unittest

-from paddle.v2.fluid.framework import Program, default_main_program
+from paddle.v2.fluid.framework import Program, default_main_program, program_guard, grad_var_name
 import paddle.v2.fluid.layers as layers

 main_program = default_main_program()
@@ -109,12 +109,10 @@ class TestProgram(unittest.TestCase):
        self.assertEqual(add_op.idx, 1)
        param_to_grad = prog.append_backward(mean_out, set())

-        def grad_name(name):
-            return name + "@GRAD"
-
        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
                         "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
+            self.assertEqual(param_to_grad[var_name][0],
+                             grad_var_name(var_name))
            self.assertEqual(param_to_grad[var_name][1], 0)

        expect_ops = [
@@ -129,13 +127,10 @@ class TestProgram(unittest.TestCase):
    def test_program_clone_with_parameter(self):
        main_program = Program()
        startup_program = Program()
-        kwargs = {
-            'main_program': main_program,
-            'startup_program': startup_program
-        }
-        d = layers.data(name='x', shape=[784], dtype='float32', **kwargs)
-        hidden = layers.fc(input=d, size=100, **kwargs)
-        layers.fc(input=hidden, size=100, **kwargs)
+        with program_guard(main_program, startup_program):
+            d = layers.data(name='x', shape=[784], dtype='float32')
+            hidden = layers.fc(input=d, size=100)
+            layers.fc(input=hidden, size=100)

        new_program = main_program.clone()
        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))

--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
 import unittest

 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, grad_var_name
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
@@ -164,7 +164,7 @@ class RecurrentOpTest1(unittest.TestCase):
            for x in self.data_field
        }
        fetch_list = [
-            self.main_program.global_block().var(x + "@GRAD")
+            self.main_program.global_block().var(grad_var_name(x))
            for x in self.data_field
        ]


--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -2,7 +2,7 @@ import unittest
 import paddle.v2.fluid.core as core
 import numpy as np
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops

@@ -75,24 +75,20 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             level=0):
        place = self.place()
        program = Program()
-        x = layers.data(name='x', shape=[1], main_program=program)
+        with program_guard(program):
+            x = layers.data(name='x', shape=[1])
            x.persistable = True

-        y = layers.data(name='y', shape=[1], main_program=program)
+            y = layers.data(name='y', shape=[1])
            y.persistable = True

            out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level, main_program=program)
+                input=x, mask=y, level=level)
            out_true.persistable = True
            out_false.persistable = True

            out = layers.merge_lod_tensor(
-            in_true=out_true,
-            in_false=out_false,
-            mask=y,
-            x=x,
-            level=level,
-            main_program=program)
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)

            out.persistable = True

@@ -123,32 +119,19 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
    def test_grad(self):
        place = core.CPUPlace()
        program = Program()
-
+        with program_guard(program):
            x = layers.data(
-            name='x',
-            shape=[1],
-            dtype='float32',
-            main_program=program,
-            stop_gradient=False)
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
            y = layers.data(
-            name='y',
-            shape=[1],
-            dtype='bool',
-            main_program=program,
-            stop_gradient=False)
+                name='y', shape=[1], dtype='bool', stop_gradient=False)

            level = 0

            out_true, out_false = layers.split_lod_tensor(
-            input=x, mask=y, level=level, main_program=program)
+                input=x, mask=y, level=level)
            out = layers.merge_lod_tensor(
-            in_true=out_true,
-            in_false=out_false,
-            mask=y,
-            x=x,
-            level=level,
-            main_program=program)
-        mean = layers.mean(x=out, main_program=program)
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+            mean = layers.mean(x=out)

            append_backward_ops(mean)


--- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
 import unittest
+import numpy
+
 from paddle.v2.fluid.op import Operator
 import paddle.v2.fluid.core as core
-import numpy
+import paddle.v2.fluid as fluid


 class TestUniformRandomOp(unittest.TestCase):
-    def test_uniform_random_cpu(self):
+    def setUp(self):
+        self.op_type = "uniform_random"
+        self.inputs = {}
+        self.attrs = {
+            "shape": [1000, 784],
+            "min": -5.0,
+            "max": 10.0,
+            "seed": 10
+        }
+        self.outputs = ["Out"]
+
+    def test_cpu(self):
        self.uniform_random_test(place=core.CPUPlace())

-    def test_uniform_random_gpu(self):
+    def test_gpu(self):
        if core.is_compile_gpu():
            self.uniform_random_test(place=core.GPUPlace(0))

    def uniform_random_test(self, place):
-        scope = core.Scope()
-        scope.var('X').get_tensor()
-
-        op = Operator(
-            "uniform_random",
-            Out='X',
-            shape=[1000, 784],
-            min=-5.0,
-            max=10.0,
-            seed=10)
-
-        ctx = core.DeviceContext.create(place)
-        op.run(scope, ctx)
-        tensor = numpy.array(scope.find_var('X').get_tensor())
+        context = core.DeviceContext.create(place)
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = fluid.Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+
+        tensor = outs[0]
+
        self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)