Commit 540af318 authored by: Yang Yu

Merge branch 'develop' of github.com:baidu/Paddle into feature/add_reorder_lod_tensor

...@@ -6,8 +6,18 @@ height = 227 ...@@ -6,8 +6,18 @@ height = 227
width = 227 width = 227
num_class = 1000 num_class = 1000
batch_size = get_config_arg('batch_size', int, 128) batch_size = get_config_arg('batch_size', int, 128)
gp = get_config_arg('layer_num', int, 1)
is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} args = {
'height': height,
'width': width,
'color': True,
'num_class': num_class,
'is_infer': is_infer,
'num_samples': num_samples
}
define_py_data_sources2( define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args) "train.list", None, module="provider", obj="process", args=args)
...@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2) ...@@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
# conv2 # conv2
net = img_conv_layer( net = img_conv_layer(
input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1) input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
net = img_pool_layer(input=net, pool_size=3, stride=2) net = img_pool_layer(input=net, pool_size=3, stride=2)
...@@ -40,11 +50,11 @@ net = img_conv_layer( ...@@ -40,11 +50,11 @@ net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1) input=net, filter_size=3, num_filters=384, stride=1, padding=1)
# conv4 # conv4
net = img_conv_layer( net = img_conv_layer(
input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1) input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
# conv5 # conv5
net = img_conv_layer( net = img_conv_layer(
input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1) input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
net = img_pool_layer(input=net, pool_size=3, stride=2) net = img_pool_layer(input=net, pool_size=3, stride=2)
net = fc_layer( net = fc_layer(
...@@ -59,6 +69,9 @@ net = fc_layer( ...@@ -59,6 +69,9 @@ net = fc_layer(
layer_attr=ExtraAttr(drop_rate=0.5)) layer_attr=ExtraAttr(drop_rate=0.5))
net = fc_layer(input=net, size=1000, act=SoftmaxActivation()) net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
lab = data_layer('label', num_class) if is_infer:
loss = cross_entropy(input=net, label=lab) outputs(net)
outputs(loss) else:
lab = data_layer('label', num_class)
loss = cross_entropy(input=net, label=lab)
outputs(loss)
...@@ -7,13 +7,15 @@ num_class = 1000 ...@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 128) batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True) use_gpu = get_config_arg('use_gpu', bool, True)
is_infer = get_config_arg("is_infer", bool, False) is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = { args = {
'height': height, 'height': height,
'width': width, 'width': width,
'color': True, 'color': True,
'num_class': num_class, 'num_class': num_class,
'is_infer': is_infer 'is_infer': is_infer,
'num_samples': num_samples
} }
define_py_data_sources2( define_py_data_sources2(
"train.list" if not is_infer else None, "train.list" if not is_infer else None,
......
...@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): ...@@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
else: else:
settings.data_size = settings.height * settings.width settings.data_size = settings.height * settings.width
settings.is_infer = kwargs.get('is_infer', False) settings.is_infer = kwargs.get('is_infer', False)
settings.num_samples = kwargs.get('num_samples', 2560)
if settings.is_infer: if settings.is_infer:
settings.slots = [dense_vector(settings.data_size)] settings.slots = [dense_vector(settings.data_size)]
else: else:
...@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): ...@@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs):
@provider( @provider(
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_list): def process(settings, file_list):
for i in xrange(2560 if settings.is_infer else 1024): for i in xrange(settings.num_samples):
img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
if settings.is_infer: if settings.is_infer:
yield img.astype('float32') yield img.astype('float32')
......
...@@ -7,13 +7,15 @@ num_class = 1000 ...@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64) batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg("layer_num", int, 50) layer_num = get_config_arg("layer_num", int, 50)
is_infer = get_config_arg("is_infer", bool, False) is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = { args = {
'height': height, 'height': height,
'width': width, 'width': width,
'color': True, 'color': True,
'num_class': num_class, 'num_class': num_class,
'is_infer': is_infer 'is_infer': is_infer,
'num_samples': num_samples
} }
define_py_data_sources2( define_py_data_sources2(
"train.list" if not is_infer else None, "train.list" if not is_infer else None,
......
...@@ -37,7 +37,7 @@ function infer() { ...@@ -37,7 +37,7 @@ function infer() {
--trainer_count=1 \ --trainer_count=1 \
--num_passes=1 \ --num_passes=1 \
--save_dir="models/${topology}-${layer_num}" \ --save_dir="models/${topology}-${layer_num}" \
--config_args="batch_size=128,layer_num=${layer_num}" \ --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
> /dev/null 2>&1 > /dev/null 2>&1
echo "Done" echo "Done"
fi fi
...@@ -79,8 +79,9 @@ fi ...@@ -79,8 +79,9 @@ fi
# inference benchmark # inference benchmark
for use_mkldnn in True False; do for use_mkldnn in True False; do
for batchsize in 1 2 4 8 16; do for batchsize in 1 2 4 8 16; do
infer googlenet v1 $batchsize $use_mkldnn
infer resnet 50 $batchsize $use_mkldnn
infer vgg 19 $batchsize $use_mkldnn infer vgg 19 $batchsize $use_mkldnn
infer resnet 50 $batchsize $use_mkldnn
infer googlenet v1 $batchsize $use_mkldnn
infer alexnet 2 $batchsize $use_mkldnn
done done
done done
...@@ -47,5 +47,6 @@ for use_mkldnn in True False; do ...@@ -47,5 +47,6 @@ for use_mkldnn in True False; do
train vgg 19 $batchsize $use_mkldnn train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn train googlenet v1 $batchsize $use_mkldnn
train alexnet 2 $batchsize $use_mkldnn
done done
done done
...@@ -23,24 +23,25 @@ function infer() { ...@@ -23,24 +23,25 @@ function infer() {
echo "./run_mkl_infer.sh to save the model first" echo "./run_mkl_infer.sh to save the model first"
exit 0 exit 0
fi fi
log_period=$((256 / bs)) log_period=$((32 / bs))
paddle train --job=test \ paddle train --job=test \
--config="${topology}.py" \ --config="${topology}.py" \
--use_mkldnn=False \
--use_gpu=False \ --use_gpu=False \
--trainer_count=$thread \ --trainer_count=$thread \
--log_period=$log_period \ --log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \ --init_model_path=$models_in \
2>&1 | tee ${log} 2>&1 | tee ${log}
# calculate the last 5 logs period time of 1280 samples, # calculate the last 5 logs period time of 160(=32*5) samples,
# the time before are burning time. # the time before are burning time.
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
start_sec=`clock_to_seconds $start` start_sec=`clock_to_seconds $start`
end_sec=`clock_to_seconds $end` end_sec=`clock_to_seconds $end`
fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
} }
...@@ -56,7 +57,8 @@ fi ...@@ -56,7 +57,8 @@ fi
# inference benchmark # inference benchmark
for batchsize in 1 2 4 8 16; do for batchsize in 1 2 4 8 16; do
infer googlenet v1 $batchsize
infer resnet 50 $batchsize
infer vgg 19 $batchsize infer vgg 19 $batchsize
infer resnet 50 $batchsize
infer googlenet v1 $batchsize
infer alexnet 2 $batchsize
done done
...@@ -12,10 +12,11 @@ function train() { ...@@ -12,10 +12,11 @@ function train() {
config="${topology}.py" config="${topology}.py"
paddle train --job=time \ paddle train --job=time \
--config=$config \ --config=$config \
--use_mkldnn=False \
--use_gpu=False \ --use_gpu=False \
--trainer_count=$thread \ --trainer_count=$thread \
--log_period=10 \ --log_period=3 \
--test_period=100 \ --test_period=30 \
--config_args=$args \ --config_args=$args \
2>&1 | tee ${log} 2>&1 | tee ${log}
...@@ -36,4 +37,5 @@ for batchsize in 64 128 256; do ...@@ -36,4 +37,5 @@ for batchsize in 64 128 256; do
train vgg 19 $batchsize train vgg 19 $batchsize
train resnet 50 $batchsize train resnet 50 $batchsize
train googlenet v1 $batchsize train googlenet v1 $batchsize
train alexnet 2 $batchsize
done done
...@@ -7,13 +7,15 @@ num_class = 1000 ...@@ -7,13 +7,15 @@ num_class = 1000
batch_size = get_config_arg('batch_size', int, 64) batch_size = get_config_arg('batch_size', int, 64)
layer_num = get_config_arg('layer_num', int, 19) layer_num = get_config_arg('layer_num', int, 19)
is_infer = get_config_arg("is_infer", bool, False) is_infer = get_config_arg("is_infer", bool, False)
num_samples = get_config_arg('num_samples', int, 2560)
args = { args = {
'height': height, 'height': height,
'width': width, 'width': width,
'color': True, 'color': True,
'num_class': num_class, 'num_class': num_class,
'is_infer': is_infer 'is_infer': is_infer,
'num_samples': num_samples
} }
define_py_data_sources2( define_py_data_sources2(
"train.list" if not is_infer else None, "train.list" if not is_infer else None,
......
...@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND) ...@@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND)
IF(WITH_C_API) IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID) IF(ANDROID)
INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
ELSE() ELSE()
INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib) INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
ENDIF() ENDIF()
ENDIF() ENDIF()
......
...@@ -467,7 +467,7 @@ lambda_cost ...@@ -467,7 +467,7 @@ lambda_cost
:noindex: :noindex:
square_error_cost square_error_cost
-------- -----------------
.. autoclass:: paddle.v2.layer.square_error_cost .. autoclass:: paddle.v2.layer.square_error_cost
:noindex: :noindex:
...@@ -533,7 +533,7 @@ Miscs ...@@ -533,7 +533,7 @@ Miscs
===== =====
dropout dropout
-------------- --------
.. autoclass:: paddle.v2.layer.dropout .. autoclass:: paddle.v2.layer.dropout
:noindex: :noindex:
......
...@@ -19,17 +19,17 @@ dynamic_lstm ...@@ -19,17 +19,17 @@ dynamic_lstm
:noindex: :noindex:
data data
--------- ----
.. autofunction:: paddle.v2.fluid.layers.data .. autofunction:: paddle.v2.fluid.layers.data
:noindex: :noindex:
mean mean
--------- ----
.. autofunction:: paddle.v2.fluid.layers.mean .. autofunction:: paddle.v2.fluid.layers.mean
:noindex: :noindex:
mul mul
--------- ---
.. autofunction:: paddle.v2.fluid.layers.mul .. autofunction:: paddle.v2.fluid.layers.mul
:noindex: :noindex:
...@@ -45,13 +45,13 @@ elementwise_div ...@@ -45,13 +45,13 @@ elementwise_div
dropout dropout
--------- -------
.. autofunction:: paddle.v2.fluid.layers.dropout .. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
reshape reshape
--------- --------
.. autofunction:: paddle.v2.fluid.layers.reshape .. autofunction:: paddle.v2.fluid.layers.reshape
:noindex: :noindex:
...@@ -81,67 +81,67 @@ transpose ...@@ -81,67 +81,67 @@ transpose
sigmoid_cross_entropy_with_logits sigmoid_cross_entropy_with_logits
--------- ---------------------------------
.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits .. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
:noindex: :noindex:
cast cast
--------- ----
.. autofunction:: paddle.v2.fluid.layers.cast .. autofunction:: paddle.v2.fluid.layers.cast
:noindex: :noindex:
concat concat
--------- -------
.. autofunction:: paddle.v2.fluid.layers.concat .. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
sums sums
--------- ----
.. autofunction:: paddle.v2.fluid.layers.sums .. autofunction:: paddle.v2.fluid.layers.sums
:noindex: :noindex:
linear_chain_crf linear_chain_crf
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf .. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex: :noindex:
assign assign
--------- -------
.. autofunction:: paddle.v2.fluid.layers.embedding .. autofunction:: paddle.v2.fluid.layers.embedding
:noindex: :noindex:
split_lod_tensor split_lod_tensor
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor .. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
merge_lod_tensor merge_lod_tensor
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor .. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex: :noindex:
cos_sim cos_sim
--------- --------
.. autofunction:: paddle.v2.fluid.layers.cos_sim .. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex: :noindex:
cross_entropy cross_entropy
--------- -------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy .. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex: :noindex:
square_error_cost square_error_cost
--------- -----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost .. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex: :noindex:
...@@ -153,68 +153,68 @@ accuracy ...@@ -153,68 +153,68 @@ accuracy
sequence_conv sequence_conv
--------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv .. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex: :noindex:
conv2d conv2d
--------- ------
.. autofunction:: paddle.v2.fluid.layers.conv2d .. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex: :noindex:
sequence_pool sequence_pool
--------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool .. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex: :noindex:
pool2d pool2d
--------- ------
.. autofunction:: paddle.v2.fluid.layers.pool2d .. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex: :noindex:
batch_norm batch_norm
--------- ----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm .. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex: :noindex:
beam_search_decode beam_search_decode
--------- ------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode .. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex: :noindex:
lod_rank_table lod_rank_table
--------- --------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table .. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
max_sequence_len max_sequence_len
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len .. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
topk topk
--------- -----
.. autofunction:: paddle.v2.fluid.layers.topk .. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
lod_tensor_to_array lod_tensor_to_array
--------- -------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array .. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
array_to_lod_tensor array_to_lod_tensor
--------- -------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor .. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
...@@ -222,26 +222,26 @@ array_to_lod_tensor ...@@ -222,26 +222,26 @@ array_to_lod_tensor
fill_constant fill_constant
--------- -------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant .. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex: :noindex:
fill_constant_batch_size_like fill_constant_batch_size_like
--------- -----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like .. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex: :noindex:
ones ones
--------- ----
.. autofunction:: paddle.v2.fluid.layers.ones .. autofunction:: paddle.v2.fluid.layers.ones
:noindex: :noindex:
zeros zeros
--------- -----
.. autofunction:: paddle.v2.fluid.layers.zeros .. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
...@@ -253,14 +253,14 @@ increment ...@@ -253,14 +253,14 @@ increment
array_write array_write
--------- -----------
.. autofunction:: paddle.v2.fluid.layers.array_write .. autofunction:: paddle.v2.fluid.layers.array_write
:noindex: :noindex:
create_array create_array
--------- ------------
.. autofunction:: paddle.v2.fluid.layers.create_array .. autofunction:: paddle.v2.fluid.layers.create_array
:noindex: :noindex:
...@@ -272,31 +272,31 @@ less_than ...@@ -272,31 +272,31 @@ less_than
array_read array_read
--------- ----------
.. autofunction:: paddle.v2.fluid.layers.array_read .. autofunction:: paddle.v2.fluid.layers.array_read
:noindex: :noindex:
shrink_memory shrink_memory
--------- --------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory .. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex: :noindex:
array_length array_length
--------- -------------
.. autofunction:: paddle.v2.fluid.layers.array_length .. autofunction:: paddle.v2.fluid.layers.array_length
:noindex: :noindex:
conv2d_transpose conv2d_transpose
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex: :noindex:
sequence_expand sequence_expand
--------- ---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand .. autofunction:: paddle.v2.fluid.layers.sequence_expand
:noindex: :noindex:
...@@ -308,13 +308,13 @@ lstm_unit ...@@ -308,13 +308,13 @@ lstm_unit
sequence_softmax sequence_softmax
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax .. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex: :noindex:
reduce_sum reduce_sum
--------- ----------
.. autofunction:: paddle.v2.fluid.layers.reduce_sum .. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex: :noindex:
...@@ -3,19 +3,19 @@ Nets ...@@ -3,19 +3,19 @@ Nets
=========== ===========
simple_img_conv_pool simple_img_conv_pool
----------- --------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool .. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex: :noindex:
img_conv_group img_conv_group
----------- ---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group .. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex: :noindex:
sequence_conv_pool sequence_conv_pool
----------- ------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex: :noindex:
......
...@@ -18,7 +18,7 @@ SGDOptimizer ...@@ -18,7 +18,7 @@ SGDOptimizer
MomentumOptimizer MomentumOptimizer
----------- -----------------
.. automodule:: paddle.v2.fluid.optimizer .. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer :members: MomentumOptimizer
:noindex: :noindex:
...@@ -26,14 +26,14 @@ MomentumOptimizer ...@@ -26,14 +26,14 @@ MomentumOptimizer
AdagradOptimizer AdagradOptimizer
----------- ----------------
.. automodule:: paddle.v2.fluid.optimizer .. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer :members: AdagradOptimizer
:noindex: :noindex:
AdamOptimizer AdamOptimizer
----------- -------------
.. automodule:: paddle.v2.fluid.optimizer .. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer :members: AdamOptimizer
:noindex: :noindex:
...@@ -47,7 +47,7 @@ AdamaxOptimizer ...@@ -47,7 +47,7 @@ AdamaxOptimizer
DecayedAdagradOptimizer DecayedAdagradOptimizer
----------- -----------------------
.. automodule:: paddle.v2.fluid.optimizer .. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer :members: DecayedAdagradOptimizer
:noindex: :noindex:
......
...@@ -3,14 +3,14 @@ Regularizer ...@@ -3,14 +3,14 @@ Regularizer
=========== ===========
WeightDecayRegularizer WeightDecayRegularizer
----------- ----------------------
.. automodule:: paddle.v2.fluid.regularizer .. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer :members: WeightDecayRegularizer
:noindex: :noindex:
L2DecayRegularizer L2DecayRegularizer
----------- ------------------
.. automodule:: paddle.v2.fluid.regularizer .. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer :members: L2DecayRegularizer
:noindex: :noindex:
...@@ -18,7 +18,7 @@ L2DecayRegularizer ...@@ -18,7 +18,7 @@ L2DecayRegularizer
L1DecayRegularizer L1DecayRegularizer
----------- -------------------
.. automodule:: paddle.v2.fluid.regularizer .. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer :members: L1DecayRegularizer
......
# Design Doc: The Keys of Operator Kernel Type
## Problem
An operator can have multiple kernel implementations, and each operator maintains a map of its kernels. Fluid uses `OpKernelType` as the key that identifies a unique kernel. Before an operator runs, a certain kernel must be chosen by an `OpKernelType` key. Currently, `OpKernelType` is defined as follows:
```cpp
struct OpKernelType {
platform::Place place_;
proto::DataType data_type_;
};
```
For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) on GitHub.
It contains two keys, `Place` and `DataType`, which are hashed into a unique key that represents a certain kernel type. However, these two keys are not enough; we need a more complete representation of `OpKernelType`.
We often implement an operator kernel with a computing library on a certain device (place). Note that computing library and device do not correspond one to one: a device can be served by many computing libraries, and a computing library can also support several devices.
For example, the Eigen library supports Nvidia GPU, AMD GPU, and CPU, while the MKLDNN library supports Intel CPU and Intel FPGA. Therefore, both `Place` and `Library` should be keys of `OpKernelType`.
It is obvious that different data types, such as fp64/fp32/int8, require different kernels. But the data layout of a Tensor also leads to different implementations; see the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). So data layout should be taken into consideration as well.
## Solution
Four keys determine the kernel type of an operator: `Place`, `Library`, `DataType`, and `Layout`.
```cpp
struct OpKernelType {
platform::Place place_;
platform::Library library_;
proto::DataType data_type_;
framework::Layout layout_;
};
```
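As a rough illustration of how the four keys could be combined into a single map key (a sketch only; `Place`, `Library`, `DataType`, and `Layout` below are simplified stand-ins, not Fluid's real types), consider:

```cpp
#include <cstddef>

// Simplified stand-ins for the Fluid types named in the struct above.
enum class Place { kCPU, kCUDA };
enum class Library { kPlain, kMKLDNN, kCUDNN };
enum class DataType { kFP32, kFP64, kINT32, kINT64 };
enum class Layout { kNCHW, kNHWC };

struct OpKernelType {
  Place place_;
  Library library_;
  DataType data_type_;
  Layout layout_;

  bool operator==(const OpKernelType& o) const {
    return place_ == o.place_ && library_ == o.library_ &&
           data_type_ == o.data_type_ && layout_ == o.layout_;
  }
};

// Combine the four keys into one hash value so OpKernelType can index
// an unordered kernel map.
struct OpKernelTypeHash {
  std::size_t operator()(const OpKernelType& t) const {
    std::size_t seed = 0;
    auto mix = [&seed](std::size_t v) {
      seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    };
    mix(static_cast<std::size_t>(t.place_));
    mix(static_cast<std::size_t>(t.library_));
    mix(static_cast<std::size_t>(t.data_type_));
    mix(static_cast<std::size_t>(t.layout_));
    return seed;
  }
};
```

With an equality operator and a hash like this, the kernel map can be an `std::unordered_map` keyed by `OpKernelType`.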
The details of each key are as follows:
### Place
`Place` is defined as follows:
```cpp
typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
```
`Place` represents the device memory where the data resides.
### Library
One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
```cpp
enum Library { Plain, MKLDNN, CUDNN };
```
We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented with the `Eigen` library, we treat the `Eigen` library as `Plain`.
A library usually has a corresponding `DeviceContext` that contains the handles needed for computation. Fluid currently has two default DeviceContexts, for CPU and CUDA: `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
If we want to support a new library, a new enumerator needs to be added to `Library`, and a corresponding `LibraryDeviceContext` will be created.
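As a minimal sketch of this correspondence (a hypothetical class layout, not Fluid's actual definitions), each library's context simply carries the handles that its kernels need:

```cpp
// Hypothetical, simplified hierarchy; the real Fluid contexts wrap Eigen,
// cuBLAS and (for a new library) MKLDNN handles as described above.
struct DeviceContext {
  virtual ~DeviceContext() = default;
};

struct CPUDeviceContext : public DeviceContext {
  // Eigen device handle for CPU kernels would live here.
};

struct CUDADeviceContext : public DeviceContext {
  // Eigen GPU device handle and a cuBLAS handle would live here.
};

// Supporting a new library means adding a Library enumerator plus its own
// context type, e.g. an MKLDNNDeviceContext holding an MKLDNN engine.
```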
### DataType
`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
### Layout
A Tensor is essentially a view of a block of memory. Besides a pointer to the memory, we also need other descriptions of this block, such as shape (ddim), stride, and layout.
Different layouts lead to different kernel implementations. There are four main principles we follow to support layout in the Fluid framework:
- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, the MKLDNN memory formats are added to this enum as well.
- Users have to set the layout of input data, and some operators, such as fill_constant/random, also have to set the layout of the data they generate. Of course, there can be a default layout, such as NCHW.
- Layout is inferred at run-time, not at compile-time.
- Every operator has to implement different kernels for different layouts. Take MKLDNN as an example: to implement an MKLDNN convolution operator, we have to implement kernels for all the layouts listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html), and a special macro will be provided to register kernels for MKLDNN operators.
`Layout` is also defined as an enum variable:
```cpp
enum Layout {
kNCHW,
kNHWC,
#ifdef PADDLE_WITH_MKLDNN
knChw8c
...
#endif
};
```
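Putting the pieces together, kernel selection before an operator runs could then look roughly like the following (again a sketch that reuses the simplified `OpKernelType` and `OpKernelTypeHash` above, not Fluid's real registration API):

```cpp
#include <stdexcept>
#include <unordered_map>

// A kernel is modeled as a plain function pointer for brevity.
using OpKernelFn = void (*)();

void ConvCPUPlainFP32NCHW() { /* compute the convolution */ }

int main() {
  std::unordered_map<OpKernelType, OpKernelFn, OpKernelTypeHash> kernels;

  // Register one kernel under its full four-part key.
  kernels[OpKernelType{Place::kCPU, Library::kPlain, DataType::kFP32,
                       Layout::kNCHW}] = ConvCPUPlainFP32NCHW;

  // Before the operator runs, build the expected key and pick the kernel.
  OpKernelType expected{Place::kCPU, Library::kPlain, DataType::kFP32,
                        Layout::kNCHW};
  auto it = kernels.find(expected);
  if (it == kernels.end()) {
    throw std::runtime_error("no kernel registered for this OpKernelType");
  }
  it->second();  // run the chosen kernel
  return 0;
}
```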
...@@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具 ...@@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具
:header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3 :widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
.. _pip_dependency: .. _pip_dependency:
......
...@@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star ...@@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star
:header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3 :widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
.. _pip_dependency: .. _pip_dependency:
......
...@@ -42,7 +42,7 @@ static std::unordered_set<std::string>& CtrlFlowOps() { ...@@ -42,7 +42,7 @@ static std::unordered_set<std::string>& CtrlFlowOps() {
static inline std::unique_ptr<OperatorBase> CreateGradOp( static inline std::unique_ptr<OperatorBase> CreateGradOp(
const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set, const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
OpDescBind op_desc; OpDesc op_desc;
op_desc.SetInputMap(op.Inputs()); op_desc.SetInputMap(op.Inputs());
op_desc.SetOutputMap(op.Outputs()); op_desc.SetOutputMap(op.Outputs());
op_desc.SetType(op.Type()); op_desc.SetType(op.Type());
...@@ -53,7 +53,7 @@ static inline std::unique_ptr<OperatorBase> CreateGradOp( ...@@ -53,7 +53,7 @@ static inline std::unique_ptr<OperatorBase> CreateGradOp(
grad_ops.reserve(grad_descs.size()); grad_ops.reserve(grad_descs.size());
std::transform(grad_descs.begin(), grad_descs.end(), std::transform(grad_descs.begin(), grad_descs.end(),
std::back_inserter(grad_ops), std::back_inserter(grad_ops),
[](const std::unique_ptr<OpDescBind>& grad_desc) { [](const std::unique_ptr<OpDesc>& grad_desc) {
return OpRegistry::CreateOp(*grad_desc); return OpRegistry::CreateOp(*grad_desc);
}); });
PADDLE_ENFORCE(!grad_ops.empty()); PADDLE_ENFORCE(!grad_ops.empty());
...@@ -217,7 +217,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive( ...@@ -217,7 +217,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
// If part of input gradient of that operator is not calculated, fill // If part of input gradient of that operator is not calculated, fill
// zero variables to that input gradient. // zero variables to that input gradient.
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
{{"Y", {grad_input}}}, {{"Out", {grad_input}}},
AttributeMap{})); AttributeMap{}));
} }
return false; return false;
...@@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) { ...@@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) {
static void CreateGradVarInBlock( static void CreateGradVarInBlock(
size_t grad_op_start_index, size_t grad_op_start_index,
const std::unordered_map<std::string, std::string>& param_name_map, const std::unordered_map<std::string, std::string>& param_name_map,
BlockDescBind* block_desc, BlockDesc* block_desc,
std::unordered_map<std::string, GradVarInfo>* grad_var_record) { std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
auto ops = block_desc->AllOps(); auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size(); for (size_t op_index = grad_op_start_index; op_index < ops.size();
...@@ -350,12 +350,11 @@ static void CreateGradVarInBlock( ...@@ -350,12 +350,11 @@ static void CreateGradVarInBlock(
} }
} }
std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
const OpDescBind* op_desc, std::unordered_set<std::string>* no_grad_vars, const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var, std::unordered_map<std::string, std::string>* grad_to_var,
const std::vector<BlockDescBind*>& grad_block = const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
std::vector<BlockDescBind*>()) { std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
// All input gradients of forwarding operator do not need to calculate. // All input gradients of forwarding operator do not need to calculate.
const std::vector<std::string>& inputs = op_desc->InputArgumentNames(); const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
if (AllGradInSet(inputs, *no_grad_vars)) { if (AllGradInSet(inputs, *no_grad_vars)) {
...@@ -386,7 +385,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( ...@@ -386,7 +385,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
.Get(op_desc->Type()) .Get(op_desc->Type())
.GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops; std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
for (auto& desc : grad_op_descs) { for (auto& desc : grad_op_descs) {
for (const std::string& in_name : desc->InputArgumentNames()) { for (const std::string& in_name : desc->InputArgumentNames()) {
if (no_grad_vars->count(in_name)) { if (no_grad_vars->count(in_name)) {
...@@ -394,9 +393,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( ...@@ -394,9 +393,9 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
std::string new_name = prefix + kZeroVarSuffix; std::string new_name = prefix + kZeroVarSuffix;
desc->Rename(in_name, new_name); desc->Rename(in_name, new_name);
std::unique_ptr<OpDescBind> fill_zeros_op( std::unique_ptr<OpDesc> fill_zeros_op(
new OpDescBind("fill_zeros_like", {{"X", {prefix}}}, new OpDesc("fill_zeros_like", {{"X", {prefix}}},
{{"Y", {new_name}}}, AttributeMap{})); {{"Out", {new_name}}}, AttributeMap{}));
pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
} }
} }
...@@ -408,34 +407,33 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad( ...@@ -408,34 +407,33 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
return grad_op_descs; return grad_op_descs;
} }
static BlockDescBind* CreateStepBlock( static BlockDesc* CreateStepBlock(
ProgramDescBind& program_desc, ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var, std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx); int step_block_idx);
std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx, ProgramDesc& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars, std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) { std::unordered_map<std::string, std::string>* grad_to_var) {
VLOG(5) << "MakeBlockBackward"; VLOG(5) << "MakeBlockBackward";
BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
std::vector<OpDescBind*> op_descs = cur_block->AllOps(); std::vector<OpDesc*> op_descs = cur_block->AllOps();
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops; std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
size_t grad_desc_idx = 0; size_t grad_desc_idx = 0;
std::vector<std::unique_ptr<OpDescBind>> backward_descs; std::vector<std::unique_ptr<OpDesc>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
VLOG(5) << "Making backward " << (*it)->Type() << " op"; VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDescBind>> op_grads; std::vector<std::unique_ptr<OpDesc>> op_grads;
if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
int step_block_idx = (*it)->GetBlockAttr("sub_block"); int step_block_idx = (*it)->GetBlockAttr("sub_block");
BlockDescBind* backward_block = CreateStepBlock( BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
program_desc, no_grad_vars, grad_to_var, step_block_idx); grad_to_var, step_block_idx);
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else if ((*it)->Type() == "conditional_block") { } else if ((*it)->Type() == "conditional_block") {
BlockDescBind* backward_block = BlockDesc* backward_block =
CreateStepBlock(program_desc, no_grad_vars, grad_to_var, CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
(*it)->GetBlockAttr("sub_block")); (*it)->GetBlockAttr("sub_block"));
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
...@@ -463,14 +461,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -463,14 +461,14 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
} }
++grad_desc_idx; ++grad_desc_idx;
} }
std::transform( std::transform(op_grads.begin(), op_grads.end(),
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), std::back_inserter(backward_descs),
[](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); }); [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
} }
VLOG(5) << "Appending Sums"; VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once // Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops; std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) { for (const auto& dup : dup_out_ops) {
const std::string& out_name = dup.first; const std::string& out_name = dup.first;
const std::vector<size_t> dup_op = dup.second; const std::vector<size_t> dup_op = dup.second;
...@@ -486,16 +484,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -486,16 +484,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
sum_op_inputs.emplace_back(new_name); sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back(); next_g_name = sum_op_inputs.back();
} }
std::unique_ptr<OpDescBind> sum_op( std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {{"Out", {out_name}}},
AttributeMap{})); AttributeMap{}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
} }
} }
pending_sum_ops.sort( pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
[](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a, const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
return a.first > b.first; return a.first > b.first;
}); });
for (auto& p : pending_sum_ops) { for (auto& p : pending_sum_ops) {
...@@ -508,14 +505,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward( ...@@ -508,14 +505,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
return backward_descs; return backward_descs;
} }
static BlockDescBind* CreateStepBlock( static BlockDesc* CreateStepBlock(
ProgramDescBind& program_desc, ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var, std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx) { int step_block_idx) {
auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
no_grad_vars, grad_to_var); no_grad_vars, grad_to_var);
BlockDescBind* backward_block = BlockDesc* backward_block =
program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
for (auto& ptr : backward_block_op_descs) { for (auto& ptr : backward_block_op_descs) {
backward_block->AppendAllocatedOp(move(ptr)); backward_block->AppendAllocatedOp(move(ptr));
...@@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock( ...@@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock(
} }
ParamGradInfoMap AppendBackward( ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target, ProgramDesc& program_desc, const VarDesc& target,
const std::unordered_set<std::string>& no_grad_vars) { const std::unordered_set<std::string>& no_grad_vars) {
std::unordered_set<std::string> no_grad_var_names; std::unordered_set<std::string> no_grad_var_names;
no_grad_var_names.reserve(no_grad_vars.size() + 1); no_grad_var_names.reserve(no_grad_vars.size() + 1);
...@@ -541,8 +537,8 @@ ParamGradInfoMap AppendBackward( ...@@ -541,8 +537,8 @@ ParamGradInfoMap AppendBackward(
PADDLE_ENFORCE(is_scalar, "target should be scalar"); PADDLE_ENFORCE(is_scalar, "target should be scalar");
VLOG(3) << "backward from loss=" << target.Name() VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType(); << " data_type=" << target.GetDataType();
std::unique_ptr<OpDescBind> fill_one_op( std::unique_ptr<OpDesc> fill_one_op(
new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}}, new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", std::vector<int>{1}}, {{"shape", std::vector<int>{1}},
{"value", static_cast<float>(1.0)}, {"value", static_cast<float>(1.0)},
{"dtype", target.GetDataType()}})); {"dtype", target.GetDataType()}}));
......
...@@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/, ...@@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
GradVarInfo /*grad_var_info*/>; GradVarInfo /*grad_var_info*/>;
ParamGradInfoMap AppendBackward( ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target, ProgramDesc& program_desc, const VarDesc& target,
const std::unordered_set<std::string>& no_grad_vars); const std::unordered_set<std::string>& no_grad_vars);
} // namespace framework } // namespace framework
......
...@@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker { ...@@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker {
using SingleGradOpDescMaker::SingleGradOpDescMaker; using SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<OpDescBind> Apply() const override { std::unique_ptr<OpDesc> Apply() const override {
auto grad_op = new OpDescBind(); auto grad_op = new OpDesc();
grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(GradVarName("X"), InputGrad("X")); grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
grad_op->SetOutput(GradVarName("b"), InputGrad("b")); grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
grad_op->SetType("rowwise_add_grad"); grad_op->SetType("rowwise_add_grad");
return std::unique_ptr<OpDescBind>(grad_op); return std::unique_ptr<OpDesc>(grad_op);
} }
}; };
...@@ -159,7 +159,7 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker { ...@@ -159,7 +159,7 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "x"); AddInput("X", "x");
AddOutput("Y", "out"); AddOutput("Out", "out");
AddComment(""); AddComment("");
} }
}; };
...@@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { ...@@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase {
public: public:
using GradOpDescMakerBase::GradOpDescMakerBase; using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<OpDescBind>> operator()() const override { std::vector<std::unique_ptr<OpDesc>> operator()() const override {
std::vector<std::unique_ptr<OpDescBind>> retv; std::vector<std::unique_ptr<OpDesc>> retv;
auto x_g = InputGrad("X"); auto x_g = InputGrad("X");
if (!x_g.empty()) { if (!x_g.empty()) {
auto *op_desc = new OpDescBind(); auto *op_desc = new OpDesc();
op_desc->SetType("scale"); op_desc->SetType("scale");
op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetInput("X", OutputGrad("Out"));
op_desc->SetOutput("Out", x_g); op_desc->SetOutput("Out", x_g);
...@@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { ...@@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase {
auto y_g = InputGrad("Y"); auto y_g = InputGrad("Y");
if (!y_g.empty()) { if (!y_g.empty()) {
auto *op_desc = new OpDescBind(); auto *op_desc = new OpDesc();
op_desc->SetType("scale"); op_desc->SetType("scale");
op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetInput("X", OutputGrad("Out"));
op_desc->SetOutput("Out", y_g); op_desc->SetOutput("Out", y_g);
...@@ -430,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) { ...@@ -430,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) {
ASSERT_EQ("fill_zeros_like", fill_zero.Type()); ASSERT_EQ("fill_zeros_like", fill_zero.Type());
ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
ASSERT_EQ("Z", fill_zero.Input("X")); ASSERT_EQ("Z", fill_zero.Input("X"));
ASSERT_EQ(1UL, fill_zero.Outputs("Y").size()); ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y")); ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
auto &d_many_out = *net->ops_[1]; auto &d_many_out = *net->ops_[1];
ASSERT_EQ("many_output_op_grad", d_many_out.Type()); ASSERT_EQ("many_output_op_grad", d_many_out.Type());
...@@ -505,25 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { ...@@ -505,25 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
} }
TEST(Backward, simple_single_op) { TEST(Backward, simple_single_op) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
f::OpDescBind *op = block->AppendOp(); f::OpDesc *op = block->AppendOp();
op->SetType("rowwise_add"); op->SetType("rowwise_add");
op->SetInput("X", {"x"}); op->SetInput("X", {"x"});
op->SetInput("b", {"b"}); op->SetInput("b", {"b"});
op->SetOutput("Out", {"out"}); op->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out"); auto target = f::VarDesc("out");
target.SetShape({1}); target.SetShape({1});
auto var_to_grad = auto var_to_grad =
AppendBackward(program, target, std::unordered_set<std::string>{}); AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 3UL); ASSERT_EQ(block->AllOps().size(), 3UL);
f::OpDescBind *fill_op = block->AllOps()[1]; f::OpDesc *fill_op = block->AllOps()[1];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op = block->AllOps()[2]; f::OpDesc *grad_op = block->AllOps()[2];
EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op->InputNames().size(), 1UL); ASSERT_EQ(grad_op->InputNames().size(), 1UL);
ASSERT_EQ(grad_op->OutputNames().size(), 2UL); ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
...@@ -543,16 +543,16 @@ TEST(Backward, simple_single_op) { ...@@ -543,16 +543,16 @@ TEST(Backward, simple_single_op) {
} }
TEST(Backward, default_attribute) { TEST(Backward, default_attribute) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
f::OpDescBind *op = block->AppendOp(); f::OpDesc *op = block->AppendOp();
op->SetType("mul"); op->SetType("mul");
op->SetInput("X", {"x"}); op->SetInput("X", {"x"});
op->SetInput("Y", {"y"}); op->SetInput("Y", {"y"});
op->SetOutput("Out", {"out"}); op->SetOutput("Out", {"out"});
op->CheckAttrs(); op->CheckAttrs();
auto target = f::VarDescBind("out"); auto target = f::VarDesc("out");
target.SetShape({1}); target.SetShape({1});
AppendBackward(program, target, std::unordered_set<std::string>{}); AppendBackward(program, target, std::unordered_set<std::string>{});
...@@ -560,47 +560,47 @@ TEST(Backward, default_attribute) { ...@@ -560,47 +560,47 @@ TEST(Backward, default_attribute) {
EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1); EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
f::OpDescBind *fill_op = block->AllOps()[1]; f::OpDesc *fill_op = block->AllOps()[1];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op = block->AllOps()[2]; f::OpDesc *grad_op = block->AllOps()[2];
ASSERT_EQ(grad_op->Type(), "mul_grad"); ASSERT_EQ(grad_op->Type(), "mul_grad");
EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1); EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
} }
TEST(Backward, simple_mult_op) { TEST(Backward, simple_mult_op) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
f::OpDescBind *op1 = block->AppendOp(); f::OpDesc *op1 = block->AppendOp();
op1->SetType("rowwise_add"); op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"}); op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"}); op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"}); op1->SetOutput("Out", {"out1"});
f::OpDescBind *op2 = block->AppendOp(); f::OpDesc *op2 = block->AppendOp();
op2->SetType("mul"); op2->SetType("mul");
op2->SetInput("X", {"out1"}); op2->SetInput("X", {"out1"});
op2->SetInput("Y", {"y2"}); op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"}); op2->SetOutput("Out", {"out2"});
f::OpDescBind *op3 = block->AppendOp(); f::OpDesc *op3 = block->AppendOp();
op3->SetType("rowwise_add"); op3->SetType("rowwise_add");
op3->SetInput("X", {"out2"}); op3->SetInput("X", {"out2"});
op3->SetInput("b", {"b3"}); op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"}); op3->SetOutput("Out", {"out3"});
auto target = f::VarDescBind("out3"); auto target = f::VarDesc("out3");
target.SetShape({1}); target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = auto var_to_grad =
AppendBackward(program, target, std::unordered_set<std::string>{}); AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 6UL + 1); ASSERT_EQ(block->AllOps().size(), 6UL + 1);
f::OpDescBind *fill_op = block->AllOps()[forward_len]; f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op1 = block->AllOps()[6]; f::OpDesc *grad_op1 = block->AllOps()[6];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
...@@ -611,7 +611,7 @@ TEST(Backward, simple_mult_op) { ...@@ -611,7 +611,7 @@ TEST(Backward, simple_mult_op) {
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")})); std::vector<std::string>({f::GradVarName("b1")}));
f::OpDescBind *grad_op2 = block->AllOps()[5]; f::OpDesc *grad_op2 = block->AllOps()[5];
EXPECT_EQ(grad_op2->Type(), "mul_grad"); EXPECT_EQ(grad_op2->Type(), "mul_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 4UL); ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
...@@ -625,7 +625,7 @@ TEST(Backward, simple_mult_op) { ...@@ -625,7 +625,7 @@ TEST(Backward, simple_mult_op) {
EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")})); std::vector<std::string>({f::GradVarName("y2")}));
f::OpDescBind *grad_op3 = block->AllOps()[4]; f::OpDesc *grad_op3 = block->AllOps()[4];
EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
...@@ -655,42 +655,42 @@ TEST(Backward, simple_mult_op) { ...@@ -655,42 +655,42 @@ TEST(Backward, simple_mult_op) {
} }
TEST(Backward, intermedia_var_no_grad) { TEST(Backward, intermedia_var_no_grad) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
f::OpDescBind *op1 = block->AppendOp(); f::OpDesc *op1 = block->AppendOp();
op1->SetType("rowwise_add"); op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"}); op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"}); op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"}); op1->SetOutput("Out", {"out1"});
f::OpDescBind *op2 = block->AppendOp(); f::OpDesc *op2 = block->AppendOp();
op2->SetType("mul"); op2->SetType("mul");
op2->SetInput("X", {"x2"}); op2->SetInput("X", {"x2"});
op2->SetInput("Y", {"y2"}); op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"}); op2->SetOutput("Out", {"out2"});
f::OpDescBind *op3 = block->AppendOp(); f::OpDesc *op3 = block->AppendOp();
op3->SetType("rowwise_add"); op3->SetType("rowwise_add");
op3->SetInput("X", {"out2"}); op3->SetInput("X", {"out2"});
op3->SetInput("b", {"b3"}); op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"}); op3->SetOutput("Out", {"out3"});
f::OpDescBind *op4 = block->AppendOp(); f::OpDesc *op4 = block->AppendOp();
op4->SetType("mul"); op4->SetType("mul");
op4->SetInput("X", {"out1"}); op4->SetInput("X", {"out1"});
op4->SetInput("Y", {"out3"}); op4->SetInput("Y", {"out3"});
op4->SetOutput("Out", {"out4"}); op4->SetOutput("Out", {"out4"});
auto target = f::VarDescBind("out4"); auto target = f::VarDesc("out4");
target.SetShape({1}); target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"out3"}); auto var_to_grad = AppendBackward(program, target, {"out3"});
ASSERT_EQ(block->AllOps().size(), 7UL); ASSERT_EQ(block->AllOps().size(), 7UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len]; f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op1 = block->AllOps()[6]; f::OpDesc *grad_op1 = block->AllOps()[6];
EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
...@@ -701,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) { ...@@ -701,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) {
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b1")})); std::vector<std::string>({f::GradVarName("b1")}));
f::OpDescBind *grad_op4 = block->AllOps()[5]; f::OpDesc *grad_op4 = block->AllOps()[5];
EXPECT_EQ(grad_op4->Type(), "mul_grad"); EXPECT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
...@@ -726,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) { ...@@ -726,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) {
} }
TEST(Backward, var_no_grad) { TEST(Backward, var_no_grad) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
f::OpDescBind *op1 = block->AppendOp(); f::OpDesc *op1 = block->AppendOp();
op1->SetType("mult_in_out"); op1->SetType("mult_in_out");
op1->SetInput("X", {"x1"}); op1->SetInput("X", {"x1"});
op1->SetInput("H", {"h1"}); op1->SetInput("H", {"h1"});
op1->SetOutput("Y", {"y1"}); op1->SetOutput("Y", {"y1"});
op1->SetOutput("Z", {"z1"}); op1->SetOutput("Z", {"z1"});
f::OpDescBind *op2 = block->AppendOp(); f::OpDesc *op2 = block->AppendOp();
op2->SetType("mult_in_out"); op2->SetType("mult_in_out");
op2->SetInput("X", {"y1"}); op2->SetInput("X", {"y1"});
op2->SetInput("H", {"z1"}); op2->SetInput("H", {"z1"});
op2->SetOutput("Y", {"y2"}); op2->SetOutput("Y", {"y2"});
op2->SetOutput("Z", {"z2"}); op2->SetOutput("Z", {"z2"});
auto target = f::VarDescBind("z2"); auto target = f::VarDesc("z2");
target.SetShape({1}); target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"z1"}); auto var_to_grad = AppendBackward(program, target, {"z1"});
ASSERT_EQ(block->AllOps().size(), 6UL); ASSERT_EQ(block->AllOps().size(), 6UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len]; f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op2 = block->AllOps()[3]; f::OpDesc *grad_op2 = block->AllOps()[3];
ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op2->InputNames().size(), 6UL); ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
...@@ -767,15 +767,15 @@ TEST(Backward, var_no_grad) { ...@@ -767,15 +767,15 @@ TEST(Backward, var_no_grad) {
std::vector<std::string>({f::GradVarName("y1")})); std::vector<std::string>({f::GradVarName("y1")}));
EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>()); EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
f::OpDescBind *fill_zero_op = block->AllOps()[4]; f::OpDesc *fill_zero_op = block->AllOps()[4];
ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"})); EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
EXPECT_EQ(fill_zero_op->Output("Y"), EXPECT_EQ(fill_zero_op->Output("Out"),
std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix})); std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
f::OpDescBind *grad_op1 = block->AllOps()[5]; f::OpDesc *grad_op1 = block->AllOps()[5];
ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 6UL); ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
...@@ -803,37 +803,37 @@ TEST(Backward, var_no_grad) { ...@@ -803,37 +803,37 @@ TEST(Backward, var_no_grad) {
} }
TEST(Backward, shared_var) { TEST(Backward, shared_var) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
f::OpDescBind *op1 = block->AppendOp(); f::OpDesc *op1 = block->AppendOp();
op1->SetType("rowwise_add"); op1->SetType("rowwise_add");
op1->SetInput("X", {"x1"}); op1->SetInput("X", {"x1"});
op1->SetInput("b", {"b1"}); op1->SetInput("b", {"b1"});
op1->SetOutput("Out", {"out1"}); op1->SetOutput("Out", {"out1"});
f::OpDescBind *op2 = block->AppendOp(); f::OpDesc *op2 = block->AppendOp();
op2->SetType("mul"); op2->SetType("mul");
op2->SetInput("X", {"out1"}); op2->SetInput("X", {"out1"});
op2->SetInput("Y", {"y2"}); op2->SetInput("Y", {"y2"});
op2->SetOutput("Out", {"out2"}); op2->SetOutput("Out", {"out2"});
f::OpDescBind *op3 = block->AppendOp(); f::OpDesc *op3 = block->AppendOp();
op3->SetType("rowwise_add"); op3->SetType("rowwise_add");
op3->SetInput("X", {"out1"}); op3->SetInput("X", {"out1"});
op3->SetInput("b", {"b3"}); op3->SetInput("b", {"b3"});
op3->SetOutput("Out", {"out3"}); op3->SetOutput("Out", {"out3"});
auto target = f::VarDescBind("out3"); auto target = f::VarDesc("out3");
target.SetShape({1}); target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = auto var_to_grad =
AppendBackward(program, target, std::unordered_set<std::string>{}); AppendBackward(program, target, std::unordered_set<std::string>{});
ASSERT_EQ(block->AllOps().size(), 8UL); ASSERT_EQ(block->AllOps().size(), 8UL);
f::OpDescBind *fill_op = block->AllOps()[forward_len]; f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
f::OpDescBind *grad_op3 = block->AllOps()[4]; f::OpDesc *grad_op3 = block->AllOps()[4];
ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
...@@ -844,7 +844,7 @@ TEST(Backward, shared_var) { ...@@ -844,7 +844,7 @@ TEST(Backward, shared_var) {
EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
std::vector<std::string>({f::GradVarName("b3")})); std::vector<std::string>({f::GradVarName("b3")}));
f::OpDescBind *grad_op4 = block->AllOps()[5]; f::OpDesc *grad_op4 = block->AllOps()[5];
ASSERT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->Type(), "mul_grad");
ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
...@@ -858,7 +858,7 @@ TEST(Backward, shared_var) { ...@@ -858,7 +858,7 @@ TEST(Backward, shared_var) {
EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
std::vector<std::string>({f::GradVarName("y2")})); std::vector<std::string>({f::GradVarName("y2")}));
f::OpDescBind *sum_op = block->AllOps()[6]; f::OpDesc *sum_op = block->AllOps()[6];
ASSERT_EQ(sum_op->Type(), "sum"); ASSERT_EQ(sum_op->Type(), "sum");
ASSERT_EQ(sum_op->InputNames().size(), 1UL); ASSERT_EQ(sum_op->InputNames().size(), 1UL);
ASSERT_EQ(sum_op->OutputNames().size(), 1UL); ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
...@@ -868,7 +868,7 @@ TEST(Backward, shared_var) { ...@@ -868,7 +868,7 @@ TEST(Backward, shared_var) {
EXPECT_EQ(sum_op->Output("Out"), EXPECT_EQ(sum_op->Output("Out"),
std::vector<std::string>({f::GradVarName("out1")})); std::vector<std::string>({f::GradVarName("out1")}));
f::OpDescBind *grad_op1 = block->AllOps()[7]; f::OpDesc *grad_op1 = block->AllOps()[7];
ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
...@@ -895,19 +895,19 @@ TEST(Backward, shared_var) { ...@@ -895,19 +895,19 @@ TEST(Backward, shared_var) {
} }
TEST(Backward, half_backward) { TEST(Backward, half_backward) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
auto *op1 = block->AppendOp(); auto *op1 = block->AppendOp();
op1->SetType("minus"); op1->SetType("minus");
op1->SetInput("X", {"a"}); op1->SetInput("X", {"a"});
op1->SetInput("Y", {"b"}); op1->SetInput("Y", {"b"});
op1->SetOutput("Out", {"out"}); op1->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out"); auto target = f::VarDesc("out");
target.SetShape({1}); target.SetShape({1});
size_t forward_len = block->AllOps().size(); size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"b"}); auto var_to_grad = AppendBackward(program, target, {"b"});
f::OpDescBind *fill_op = block->AllOps()[forward_len]; f::OpDesc *fill_op = block->AllOps()[forward_len];
EXPECT_EQ(fill_op->Type(), "fill_constant"); EXPECT_EQ(fill_op->Type(), "fill_constant");
auto ops = block->AllOps(); auto ops = block->AllOps();
ASSERT_EQ(3UL, ops.size()); ASSERT_EQ(3UL, ops.size());
......
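Taken together, the updated tests exercise one recurring pattern with the renamed descriptor classes. A condensed sketch, drawn directly from the test bodies above (`f` is the test file's namespace alias for paddle::framework; this is not a complete unit test on its own):

    f::ProgramDesc program;                         // a fresh program owns its root block
    f::BlockDesc *block = program.MutableBlock(0);  // block 0 is the root block
    f::OpDesc *op = block->AppendOp();              // forward op under test
    op->SetType("rowwise_add");
    op->SetInput("X", {"x"});
    op->SetInput("b", {"b"});
    op->SetOutput("Out", {"out"});
    auto target = f::VarDesc("out");                // the variable to differentiate
    target.SetShape({1});
    // Appends a fill_constant op plus the *_grad ops to the block and returns
    // the mapping from forward variables to their gradient variables.
    auto var_to_grad =
        AppendBackward(program, target, std::unordered_set<std::string>{});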
...@@ -19,18 +19,18 @@ limitations under the License. */ ...@@ -19,18 +19,18 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
VarDescBind *BlockDescBind::Var(const std::string &name) { VarDesc *BlockDesc::Var(const std::string &name) {
auto it = vars_.find(name); auto it = vars_.find(name);
if (it != vars_.end()) { if (it != vars_.end()) {
return it->second.get(); return it->second.get();
} }
need_update_ = true; need_update_ = true;
auto *var = new VarDescBind(name); auto *var = new VarDesc(name);
vars_[name].reset(var); vars_[name].reset(var);
return var; return var;
} }
VarDescBind *BlockDescBind::FindVar(const std::string &name) const { VarDesc *BlockDesc::FindVar(const std::string &name) const {
auto it = vars_.find(name); auto it = vars_.find(name);
if (it == vars_.end()) { if (it == vars_.end()) {
return nullptr; return nullptr;
...@@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const { ...@@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
return it->second.get(); return it->second.get();
} }
bool BlockDescBind::HasVar(const std::string &name) const { bool BlockDesc::HasVar(const std::string &name) const {
return vars_.find(name) != vars_.end(); return vars_.find(name) != vars_.end();
} }
VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
if (name == kEmptyVarName) return nullptr; if (name == kEmptyVarName) return nullptr;
auto it = vars_.find(name); auto it = vars_.find(name);
...@@ -53,53 +53,52 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { ...@@ -53,53 +53,52 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
return it->second.get(); return it->second.get();
} }
VarDescBind *BlockDescBind::FindRecursiveOrCreateVar( VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
const std::string &name_bytes) { VarDesc *res = FindVarRecursive(name_bytes);
VarDescBind *res = FindVarRecursive(name_bytes);
if (res == nullptr) { if (res == nullptr) {
res = Var(name_bytes); res = Var(name_bytes);
} }
return res; return res;
} }
bool BlockDescBind::HasVarRecursive(const std::string &name) const { bool BlockDesc::HasVarRecursive(const std::string &name) const {
return FindVarRecursive(name) != nullptr; return FindVarRecursive(name) != nullptr;
} }
std::vector<VarDescBind *> BlockDescBind::AllVars() const { std::vector<VarDesc *> BlockDesc::AllVars() const {
std::vector<VarDescBind *> res; std::vector<VarDesc *> res;
for (const auto &p : vars_) { for (const auto &p : vars_) {
res.push_back(p.second.get()); res.push_back(p.second.get());
} }
return res; return res;
} }
OpDescBind *BlockDescBind::AppendOp() { OpDesc *BlockDesc::AppendOp() {
need_update_ = true; need_update_ = true;
ops_.emplace_back(new OpDescBind()); ops_.emplace_back(new OpDesc());
return ops_.back().get(); return ops_.back().get();
} }
void BlockDescBind::AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc) { void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
need_update_ = true; need_update_ = true;
ops_.emplace_back(std::move(op_desc)); ops_.emplace_back(std::move(op_desc));
} }
OpDescBind *BlockDescBind::PrependOp() { OpDesc *BlockDesc::PrependOp() {
need_update_ = true; need_update_ = true;
ops_.emplace_front(new OpDescBind()); ops_.emplace_front(new OpDesc());
return ops_.front().get(); return ops_.front().get();
} }
std::vector<OpDescBind *> BlockDescBind::AllOps() const { std::vector<OpDesc *> BlockDesc::AllOps() const {
std::vector<OpDescBind *> res; std::vector<OpDesc *> res;
for (const auto &op : ops_) { for (const auto &op : ops_) {
res.push_back(op.get()); res.push_back(op.get());
} }
return res; return res;
} }
void BlockDescBind::Flush() { void BlockDesc::Flush() {
for (auto &op_desc : ops_) { for (auto &op_desc : ops_) {
op_desc->Flush(); op_desc->Flush();
} }
...@@ -121,43 +120,43 @@ void BlockDescBind::Flush() { ...@@ -121,43 +120,43 @@ void BlockDescBind::Flush() {
} }
} }
BlockDescBind *BlockDescBind::ParentBlock() const { BlockDesc *BlockDesc::ParentBlock() const {
if (this->desc_->parent_idx() == kNoneBlockIndex) { if (this->desc_->parent_idx() == kNoneBlockIndex) {
return nullptr; return nullptr;
} }
return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx())); return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx()));
} }
proto::BlockDesc *BlockDescBind::Proto() { proto::BlockDesc *BlockDesc::Proto() {
Flush(); Flush();
return desc_; return desc_;
} }
BlockDescBind::BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc) BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
: prog_(prog), desc_(desc), need_update_(false) { : prog_(prog), desc_(desc), need_update_(false) {
for (const proto::VarDesc &var_desc : desc_->vars()) { for (const proto::VarDesc &var_desc : desc_->vars()) {
vars_[var_desc.name()].reset(new VarDescBind(var_desc)); vars_[var_desc.name()].reset(new VarDesc(var_desc));
} }
for (const proto::OpDesc &op_desc : desc_->ops()) { for (const proto::OpDesc &op_desc : desc_->ops()) {
ops_.emplace_back(new OpDescBind(op_desc, prog)); ops_.emplace_back(new OpDesc(op_desc, prog));
} }
} }
BlockDescBind::BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc, BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
ProgramDescBind *prog) ProgramDesc *prog)
: prog_(prog), desc_(desc) { : prog_(prog), desc_(desc) {
need_update_ = true; need_update_ = true;
for (auto &op : other.ops_) { for (auto &op : other.ops_) {
ops_.emplace_back(new OpDescBind(*op)); ops_.emplace_back(new OpDesc(*op));
} }
for (auto &it : other.vars_) { for (auto &it : other.vars_) {
auto *var = new VarDescBind(*it.second); auto *var = new VarDesc(*it.second);
vars_[it.first].reset(var); vars_[it.first].reset(var);
} }
} }
void BlockDescBind::ClearPBOps() { void BlockDesc::ClearPBOps() {
auto ops = this->desc_->mutable_ops(); auto ops = this->desc_->mutable_ops();
while (!ops->empty()) { while (!ops->empty()) {
// we do not own the OpDesc, so release the ownership. // we do not own the OpDesc, so release the ownership.
...@@ -165,7 +164,7 @@ void BlockDescBind::ClearPBOps() { ...@@ -165,7 +164,7 @@ void BlockDescBind::ClearPBOps() {
} }
} }
void BlockDescBind::ClearPBVars() { void BlockDesc::ClearPBVars() {
auto vars = this->desc_->mutable_vars(); auto vars = this->desc_->mutable_vars();
while (!vars->empty()) { while (!vars->empty()) {
// we do not own the VarDesc, so release the ownership. // we do not own the VarDesc, so release the ownership.
......
...@@ -28,20 +28,19 @@ limitations under the License. */ ...@@ -28,20 +28,19 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class ProgramDescBind; class ProgramDesc;
// For each protobuf message, we provide a corresponding wrapper class that // For each protobuf message, we provide a corresponding wrapper class that
// optimizes read/write speed. Local changes are synchronized to the // optimizes read/write speed. Local changes are synchronized to the
// protobuf message only on demand (via the `Flush` method). // protobuf message only on demand (via the `Flush` method).
class BlockDescBind { class BlockDesc {
public: public:
BlockDescBind(ProgramDescBind *prog, proto::BlockDesc *desc); BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc);
BlockDescBind(const BlockDescBind &other, proto::BlockDesc *desc, BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
ProgramDescBind *prog);
~BlockDescBind() { ~BlockDesc() {
this->ClearPBVars(); this->ClearPBVars();
this->ClearPBOps(); this->ClearPBOps();
} }
...@@ -50,15 +49,15 @@ class BlockDescBind { ...@@ -50,15 +49,15 @@ class BlockDescBind {
int32_t Parent() const { return desc_->parent_idx(); } int32_t Parent() const { return desc_->parent_idx(); }
VarDescBind *Var(const std::string &name_bytes); VarDesc *Var(const std::string &name_bytes);
VarDescBind *FindVar(const std::string &name_bytes) const; VarDesc *FindVar(const std::string &name_bytes) const;
bool HasVar(const std::string &var_name) const; bool HasVar(const std::string &var_name) const;
VarDescBind *FindVarRecursive(const std::string &name_bytes) const; VarDesc *FindVarRecursive(const std::string &name_bytes) const;
VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes); VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes);
bool HasVarRecursive(const std::string &var_name) const; bool HasVarRecursive(const std::string &var_name) const;
...@@ -70,41 +69,41 @@ class BlockDescBind { ...@@ -70,41 +69,41 @@ class BlockDescBind {
return var_names; return var_names;
} }
std::vector<VarDescBind *> AllVars() const; std::vector<VarDesc *> AllVars() const;
BlockDescBind *ParentBlock() const; BlockDesc *ParentBlock() const;
OpDescBind *AppendOp(); OpDesc *AppendOp();
void AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc); void AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
OpDescBind *PrependOp(); OpDesc *PrependOp();
std::vector<OpDescBind *> AllOps() const; std::vector<OpDesc *> AllOps() const;
size_t OpSize() const { return ops_.size(); } size_t OpSize() const { return ops_.size(); }
OpDescBind *Op(int idx) { return ops_.at(idx).get(); } OpDesc *Op(int idx) { return ops_.at(idx).get(); }
void Flush(); void Flush();
proto::BlockDesc *Proto(); proto::BlockDesc *Proto();
ProgramDescBind *Program() { return this->prog_; } ProgramDesc *Program() { return this->prog_; }
private: private:
void ClearPBOps(); void ClearPBOps();
void ClearPBVars(); void ClearPBVars();
private: private:
ProgramDescBind *prog_; // not_own ProgramDesc *prog_; // not_own
proto::BlockDesc *desc_; // not_own proto::BlockDesc *desc_; // not_own
bool need_update_; bool need_update_;
std::deque<std::unique_ptr<OpDescBind>> ops_; std::deque<std::unique_ptr<OpDesc>> ops_;
std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_; std::unordered_map<std::string, std::unique_ptr<VarDesc>> vars_;
DISABLE_COPY_AND_ASSIGN(BlockDescBind); DISABLE_COPY_AND_ASSIGN(BlockDesc);
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
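As a quick sketch of how the variable helpers declared above compose (illustrative only; namespace prefixes are omitted and `block` is assumed to be a BlockDesc* obtained from a ProgramDesc):

    VarDesc *x = block->Var("x");              // finds "x" or creates it in this block
    bool has = block->HasVar("x");             // lookup restricted to this block
    VarDesc *missing = block->FindVar("y");    // nullptr when absent, never creates
    VarDesc *y = block->FindRecursiveOrCreateVar("y");  // searches parent blocks first,
                                                        // then creates in this block
    bool has_y = block->HasVarRecursive("y");  // recursive counterpart of HasVar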
...@@ -106,10 +106,10 @@ template <typename T> ...@@ -106,10 +106,10 @@ template <typename T>
struct OpInfoFiller<T, kGradOpDescMaker> { struct OpInfoFiller<T, kGradOpDescMaker> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
info->grad_op_maker_ = []( info->grad_op_maker_ = [](
const OpDescBind& fwd_op, const OpDesc& fwd_op,
const std::unordered_set<std::string>& no_grad_set, const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var, std::unordered_map<std::string, std::string>* grad_to_var,
const std::vector<BlockDescBind*>& grad_block) { const std::vector<BlockDesc*>& grad_block) {
T maker(fwd_op, no_grad_set, grad_to_var, grad_block); T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
return maker(); return maker();
}; };
...@@ -119,7 +119,7 @@ struct OpInfoFiller<T, kGradOpDescMaker> { ...@@ -119,7 +119,7 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
template <typename T> template <typename T>
struct OpInfoFiller<T, kVarTypeInference> { struct OpInfoFiller<T, kVarTypeInference> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) { info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
T inference; T inference;
inference(fwd_op, block); inference(fwd_op, block);
}; };
......
...@@ -64,7 +64,7 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { ...@@ -64,7 +64,7 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
} }
} }
void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope) { bool create_local_scope) {
// TODO(tonyyang-svail): // TODO(tonyyang-svail):
// - only runs on the first device (i.e. no interdevice communication) // - only runs on the first device (i.e. no interdevice communication)
......
...@@ -40,6 +40,16 @@ class DeviceContextPool { ...@@ -40,6 +40,16 @@ class DeviceContextPool {
return *pool; return *pool;
} }
const platform::DeviceContext* Borrow(const platform::Place& place) {
auto range = device_contexts_.equal_range(place);
if (range.first == range.second) {
PADDLE_THROW(
"'Place' is not supported, Please re-compile with WITH_GPU "
"option");
}
return range.first->second;
}
std::vector<const platform::DeviceContext*> Borrow( std::vector<const platform::DeviceContext*> Borrow(
const std::vector<platform::Place>& places) { const std::vector<platform::Place>& places) {
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
...@@ -114,7 +124,7 @@ class Executor { ...@@ -114,7 +124,7 @@ class Executor {
* ProgramDesc * ProgramDesc
* Scope * Scope
*/ */
void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true); void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true);
private: private:
std::vector<const platform::DeviceContext*> device_contexts_; std::vector<const platform::DeviceContext*> device_contexts_;
......
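The single-place Borrow added above complements the existing vector overload. A minimal sketch of a caller, assuming a DeviceContextPool reference is already in hand (how the pool singleton is obtained lies outside this hunk):

    const platform::DeviceContext* CpuContext(DeviceContextPool& pool) {
      platform::CPUPlace cpu;
      // PADDLE_THROW fires if no context is registered for the place,
      // e.g. a GPU place in a build without WITH_GPU.
      return pool.Borrow(cpu);
    }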
...@@ -22,21 +22,27 @@ ...@@ -22,21 +22,27 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
/*
This functor class is responsible for creating the gradient ops for the given
operator fwd_op. After it is called (through operator()), the pairs of
(gradient variable, corresponding input variable of fwd_op) will be added to
grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
gradient variable will be dropped or set to kEmptyVarName, depending on the template
argument DropEmptyIG in the derived classes.
*/
class GradOpDescMakerBase { class GradOpDescMakerBase {
public: public:
explicit GradOpDescMakerBase( explicit GradOpDescMakerBase(
const OpDescBind& fwd_op, const OpDesc& fwd_op, const std::unordered_set<std::string>& no_grad_set,
const std::unordered_set<std::string>& no_grad_set,
std::unordered_map<std::string, std::string>* grad_to_var, std::unordered_map<std::string, std::string>* grad_to_var,
const std::vector<BlockDescBind*>& grad_block = const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>())
std::vector<BlockDescBind*>())
: fwd_op_(fwd_op), : fwd_op_(fwd_op),
no_grad_set_(no_grad_set), no_grad_set_(no_grad_set),
grad_to_var_(grad_to_var), grad_to_var_(grad_to_var),
grad_block_(grad_block) {} grad_block_(grad_block) {}
virtual ~GradOpDescMakerBase() = default; virtual ~GradOpDescMakerBase() = default;
virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0; virtual std::vector<std::unique_ptr<OpDesc>> operator()() const = 0;
protected: protected:
std::vector<std::string> InputGrad(const std::string& name, std::vector<std::string> InputGrad(const std::string& name,
...@@ -58,6 +64,16 @@ class GradOpDescMakerBase { ...@@ -58,6 +64,16 @@ class GradOpDescMakerBase {
if (!drop_empty_grad) { if (!drop_empty_grad) {
return ret_val; return ret_val;
} }
PADDLE_ENFORCE_LE(var_names.size(), 1UL,
"BUG from operator developer:"
" for input argument with a list of variables, "
" drop_empty_grad is not allowed because it makes"
" the correspondence bewteen a variable and its gradient"
" ambiguous. Use REGISTER_OP_EX to register the op"
" or call InputGrad(?,false) in GradOpDescMaker."
" Op type %s",
fwd_op_.Type());
std::vector<std::string> dropped_ret_val; std::vector<std::string> dropped_ret_val;
dropped_ret_val.reserve(ret_val.size()); dropped_ret_val.reserve(ret_val.size());
std::copy_if(ret_val.begin(), ret_val.end(), std::copy_if(ret_val.begin(), ret_val.end(),
...@@ -105,26 +121,26 @@ class GradOpDescMakerBase { ...@@ -105,26 +121,26 @@ class GradOpDescMakerBase {
std::string ForwardOpType() const { return this->fwd_op_.Type(); } std::string ForwardOpType() const { return this->fwd_op_.Type(); }
private: private:
const OpDescBind& fwd_op_; const OpDesc& fwd_op_;
const std::unordered_set<std::string>& no_grad_set_; const std::unordered_set<std::string>& no_grad_set_;
std::unordered_map<std::string, std::string>* grad_to_var_; std::unordered_map<std::string, std::string>* grad_to_var_;
protected: protected:
std::vector<BlockDescBind*> grad_block_; std::vector<BlockDesc*> grad_block_;
}; };
class SingleGradOpDescMaker : public GradOpDescMakerBase { class SingleGradOpDescMaker : public GradOpDescMakerBase {
public: public:
using GradOpDescMakerBase::GradOpDescMakerBase; using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<OpDescBind>> operator()() const { std::vector<std::unique_ptr<OpDesc>> operator()() const {
std::vector<std::unique_ptr<OpDescBind>> retv; std::vector<std::unique_ptr<OpDesc>> retv;
retv.emplace_back(this->Apply()); retv.emplace_back(this->Apply());
return retv; return retv;
} }
protected: protected:
virtual std::unique_ptr<OpDescBind> Apply() const = 0; virtual std::unique_ptr<OpDesc> Apply() const = 0;
}; };
template <bool DropEmptyIG = true> template <bool DropEmptyIG = true>
...@@ -133,8 +149,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { ...@@ -133,8 +149,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
using SingleGradOpDescMaker::SingleGradOpDescMaker; using SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
virtual std::unique_ptr<OpDescBind> Apply() const { virtual std::unique_ptr<OpDesc> Apply() const {
auto* grad = new OpDescBind(); auto* grad = new OpDesc();
grad->SetType(this->GradOpType()); grad->SetType(this->GradOpType());
for (auto& input_param : this->InputNames()) { for (auto& input_param : this->InputNames()) {
...@@ -150,7 +166,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { ...@@ -150,7 +166,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
grad->SetAttrMap(this->Attrs()); grad->SetAttrMap(this->Attrs());
return std::unique_ptr<OpDescBind>(grad); return std::unique_ptr<OpDesc>(grad);
} }
virtual std::string GradOpType() const { virtual std::string GradOpType() const {
...@@ -161,7 +177,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { ...@@ -161,7 +177,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
class EmptyGradOpMaker : public GradOpDescMakerBase { class EmptyGradOpMaker : public GradOpDescMakerBase {
public: public:
using GradOpDescMakerBase::GradOpDescMakerBase; using GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<OpDescBind>> operator()() const override { std::vector<std::unique_ptr<OpDesc>> operator()() const override {
return {}; return {};
} }
}; };
......
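For an operator that supplies its own maker, the renamed types slot in as follows; this is an illustrative sketch only ("my_op_grad" and the argument names are placeholders, not an operator touched by this change):

    class MyOpGradMaker : public framework::SingleGradOpDescMaker {
     public:
      using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

     protected:
      std::unique_ptr<framework::OpDesc> Apply() const override {
        auto *grad = new framework::OpDesc();
        grad->SetType("my_op_grad");
        grad->SetInput("X", OutputGrad("Out"));                        // pass dOut in
        grad->SetOutput(framework::GradVarName("X"), InputGrad("X"));  // produce dX
        grad->SetAttrMap(Attrs());
        return std::unique_ptr<framework::OpDesc>(grad);
      }
    };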
...@@ -25,12 +25,11 @@ limitations under the License. */ ...@@ -25,12 +25,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class OpDescBind; class OpDesc;
class BlockDescBind; class BlockDesc;
class CompileTimeInferShapeContext : public InferShapeContext { class CompileTimeInferShapeContext : public InferShapeContext {
public: public:
CompileTimeInferShapeContext(const OpDescBind &op, CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block);
const BlockDescBind &block);
bool HasInput(const std::string &name) const override; bool HasInput(const std::string &name) const override;
...@@ -76,13 +75,12 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -76,13 +75,12 @@ class CompileTimeInferShapeContext : public InferShapeContext {
void SetDim(const std::string &name, const DDim &dim) override; void SetDim(const std::string &name, const DDim &dim) override;
const OpDescBind &op_; const OpDesc &op_;
const BlockDescBind &block_; const BlockDesc &block_;
}; };
OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs, const AttributeMap &attrs) {
const AttributeMap &attrs) {
desc_.set_type(type); desc_.set_type(type);
inputs_ = inputs; inputs_ = inputs;
outputs_ = outputs; outputs_ = outputs;
...@@ -90,7 +88,7 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, ...@@ -90,7 +88,7 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
need_update_ = true; need_update_ = true;
} }
OpDescBind::OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog) OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
: desc_(desc), need_update_(false) { : desc_(desc), need_update_(false) {
// restore inputs_ // restore inputs_
int input_size = desc_.inputs_size(); int input_size = desc_.inputs_size();
...@@ -126,20 +124,19 @@ OpDescBind::OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog) ...@@ -126,20 +124,19 @@ OpDescBind::OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog)
} }
} }
proto::OpDesc *OpDescBind::Proto() { proto::OpDesc *OpDesc::Proto() {
Flush(); Flush();
return &desc_; return &desc_;
} }
const std::vector<std::string> &OpDescBind::Input( const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
const std::string &name) const {
auto it = inputs_.find(name); auto it = inputs_.find(name);
PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
Type()); Type());
return it->second; return it->second;
} }
std::vector<std::string> OpDescBind::InputArgumentNames() const { std::vector<std::string> OpDesc::InputArgumentNames() const {
std::vector<std::string> retv; std::vector<std::string> retv;
for (auto &ipt : this->inputs_) { for (auto &ipt : this->inputs_) {
retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
...@@ -147,21 +144,20 @@ std::vector<std::string> OpDescBind::InputArgumentNames() const { ...@@ -147,21 +144,20 @@ std::vector<std::string> OpDescBind::InputArgumentNames() const {
return retv; return retv;
} }
void OpDescBind::SetInput(const std::string &param_name, void OpDesc::SetInput(const std::string &param_name,
const std::vector<std::string> &args) { const std::vector<std::string> &args) {
need_update_ = true; need_update_ = true;
inputs_[param_name] = args; inputs_[param_name] = args;
} }
const std::vector<std::string> &OpDescBind::Output( const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
const std::string &name) const {
auto it = outputs_.find(name); auto it = outputs_.find(name);
PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
name, Type()); name, Type());
return it->second; return it->second;
} }
std::vector<std::string> OpDescBind::OutputArgumentNames() const { std::vector<std::string> OpDesc::OutputArgumentNames() const {
std::vector<std::string> retv; std::vector<std::string> retv;
for (auto &ipt : this->outputs_) { for (auto &ipt : this->outputs_) {
retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
...@@ -169,19 +165,19 @@ std::vector<std::string> OpDescBind::OutputArgumentNames() const { ...@@ -169,19 +165,19 @@ std::vector<std::string> OpDescBind::OutputArgumentNames() const {
return retv; return retv;
} }
void OpDescBind::SetOutput(const std::string &param_name, void OpDesc::SetOutput(const std::string &param_name,
const std::vector<std::string> &args) { const std::vector<std::string> &args) {
need_update_ = true; need_update_ = true;
this->outputs_[param_name] = args; this->outputs_[param_name] = args;
} }
proto::AttrType OpDescBind::GetAttrType(const std::string &name) const { proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
auto it = attrs_.find(name); auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return static_cast<proto::AttrType>(it->second.which() - 1); return static_cast<proto::AttrType>(it->second.which() - 1);
} }
std::vector<std::string> OpDescBind::AttrNames() const { std::vector<std::string> OpDesc::AttrNames() const {
std::vector<std::string> retv; std::vector<std::string> retv;
retv.reserve(attrs_.size()); retv.reserve(attrs_.size());
for (auto &attr : attrs_) { for (auto &attr : attrs_) {
...@@ -190,41 +186,39 @@ std::vector<std::string> OpDescBind::AttrNames() const { ...@@ -190,41 +186,39 @@ std::vector<std::string> OpDescBind::AttrNames() const {
return retv; return retv;
} }
void OpDescBind::SetAttr(const std::string &name, const Attribute &v) { void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
this->attrs_[name] = v; this->attrs_[name] = v;
need_update_ = true; need_update_ = true;
} }
void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) { void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) {
this->attrs_[name] = &block; this->attrs_[name] = &block;
need_update_ = true; need_update_ = true;
} }
void OpDescBind::SetAttrMap( void OpDesc::SetAttrMap(
const std::unordered_map<std::string, Attribute> &attr_map) { const std::unordered_map<std::string, Attribute> &attr_map) {
attrs_ = attr_map; attrs_ = attr_map;
need_update_ = true; need_update_ = true;
} }
Attribute OpDescBind::GetAttr(const std::string &name) const { Attribute OpDesc::GetAttr(const std::string &name) const {
auto it = attrs_.find(name); auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return it->second; return it->second;
} }
int OpDescBind::GetBlockAttr(const std::string &name) const { int OpDesc::GetBlockAttr(const std::string &name) const {
auto it = attrs_.find(name); auto it = attrs_.find(name);
PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
return boost::get<BlockDescBind *>(it->second)->ID(); return boost::get<BlockDesc *>(it->second)->ID();
} }
const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap() const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
const {
return attrs_; return attrs_;
} }
void OpDescBind::Rename(const std::string &old_name, void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
const std::string &new_name) {
for (auto &input : inputs_) { for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name); std::replace(input.second.begin(), input.second.end(), old_name, new_name);
} }
...@@ -235,7 +229,7 @@ void OpDescBind::Rename(const std::string &old_name, ...@@ -235,7 +229,7 @@ void OpDescBind::Rename(const std::string &old_name,
need_update_ = true; need_update_ = true;
} }
void OpDescBind::RenameOutput(const std::string &old_name, void OpDesc::RenameOutput(const std::string &old_name,
const std::string &new_name) { const std::string &new_name) {
for (auto &output : outputs_) { for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name, std::replace(output.second.begin(), output.second.end(), old_name,
...@@ -244,7 +238,7 @@ void OpDescBind::RenameOutput(const std::string &old_name, ...@@ -244,7 +238,7 @@ void OpDescBind::RenameOutput(const std::string &old_name,
need_update_ = true; need_update_ = true;
} }
void OpDescBind::RenameInput(const std::string &old_name, void OpDesc::RenameInput(const std::string &old_name,
const std::string &new_name) { const std::string &new_name) {
for (auto &input : inputs_) { for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name); std::replace(input.second.begin(), input.second.end(), old_name, new_name);
...@@ -278,7 +272,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -278,7 +272,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
}; };
void OpDescBind::Flush() { void OpDesc::Flush() {
if (need_update_) { if (need_update_) {
this->desc_.mutable_inputs()->Clear(); this->desc_.mutable_inputs()->Clear();
for (auto &ipt : inputs_) { for (auto &ipt : inputs_) {
...@@ -330,7 +324,7 @@ static void InitInferShapeFuncs() { ...@@ -330,7 +324,7 @@ static void InitInferShapeFuncs() {
}); });
} }
void OpDescBind::CheckAttrs() { void OpDesc::CheckAttrs() {
PADDLE_ENFORCE(!Type().empty(), PADDLE_ENFORCE(!Type().empty(),
"CheckAttr() can not be called before type is setted."); "CheckAttr() can not be called before type is setted.");
auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
...@@ -342,7 +336,7 @@ void OpDescBind::CheckAttrs() { ...@@ -342,7 +336,7 @@ void OpDescBind::CheckAttrs() {
checker->Check(attrs_); checker->Check(attrs_);
} }
void OpDescBind::InferShape(const BlockDescBind &block) const { void OpDesc::InferShape(const BlockDesc &block) const {
VLOG(3) << "CompileTime infer shape on " << Type(); VLOG(3) << "CompileTime infer shape on " << Type();
InitInferShapeFuncs(); InitInferShapeFuncs();
auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
...@@ -365,7 +359,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const { ...@@ -365,7 +359,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const {
infer_shape(&ctx); infer_shape(&ctx);
} }
void OpDescBind::InferVarType(BlockDescBind *block) const { void OpDesc::InferVarType(BlockDesc *block) const {
auto &info = OpInfoMap::Instance().Get(this->Type()); auto &info = OpInfoMap::Instance().Get(this->Type());
if (info.infer_var_type_) { if (info.infer_var_type_) {
info.infer_var_type_(*this, block); info.infer_var_type_(*this, block);
...@@ -384,7 +378,7 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { ...@@ -384,7 +378,7 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
} }
CompileTimeInferShapeContext::CompileTimeInferShapeContext( CompileTimeInferShapeContext::CompileTimeInferShapeContext(
const OpDescBind &op, const BlockDescBind &block) const OpDesc &op, const BlockDesc &block)
: op_(op), block_(block) {} : op_(op), block_(block) {}
bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
......
...@@ -23,17 +23,17 @@ limitations under the License. */ ...@@ -23,17 +23,17 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class BlockDescBind; class BlockDesc;
class ProgramDescBind; class ProgramDesc;
class OpDescBind { class OpDesc {
public: public:
OpDescBind() {} OpDesc() {}
OpDescBind(const std::string &type, const VariableNameMap &inputs, OpDesc(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs); const VariableNameMap &outputs, const AttributeMap &attrs);
OpDescBind(const proto::OpDesc &desc, ProgramDescBind *prog); OpDesc(const proto::OpDesc &desc, ProgramDesc *prog);
proto::OpDesc *Proto(); proto::OpDesc *Proto();
...@@ -65,7 +65,7 @@ class OpDescBind { ...@@ -65,7 +65,7 @@ class OpDescBind {
void SetAttr(const std::string &name, const Attribute &v); void SetAttr(const std::string &name, const Attribute &v);
void SetBlockAttr(const std::string &name, BlockDescBind &block); void SetBlockAttr(const std::string &name, BlockDesc &block);
Attribute GetAttr(const std::string &name) const; Attribute GetAttr(const std::string &name) const;
...@@ -107,9 +107,9 @@ class OpDescBind { ...@@ -107,9 +107,9 @@ class OpDescBind {
void CheckAttrs(); void CheckAttrs();
void InferShape(const BlockDescBind &block) const; void InferShape(const BlockDesc &block) const;
void InferVarType(BlockDescBind *block) const; void InferVarType(BlockDesc *block) const;
void MarkAsTarget() { desc_.set_is_target(true); } void MarkAsTarget() { desc_.set_is_target(true); }
...@@ -127,7 +127,9 @@ class OpDescBind { ...@@ -127,7 +127,9 @@ class OpDescBind {
} }
proto::OpDesc desc_; proto::OpDesc desc_;
// input arg name => input variable names
VariableNameMap inputs_; VariableNameMap inputs_;
// output arg name => output variable names
VariableNameMap outputs_; VariableNameMap outputs_;
AttributeMap attrs_; AttributeMap attrs_;
......
...@@ -47,7 +47,7 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap( ...@@ -47,7 +47,7 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
std::unique_ptr<OperatorBase> OpRegistry::CreateOp( std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
const proto::OpDesc& op_desc) { const proto::OpDesc& op_desc) {
VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
"used in unit tests. Use CreateOp(const OpDescBind& op_desc) " "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
"instead."; "instead.";
VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
...@@ -59,7 +59,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp( ...@@ -59,7 +59,7 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
return CreateOp(op_desc.type(), inputs, outputs, attrs); return CreateOp(op_desc.type(), inputs, outputs, attrs);
} }
std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) { std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
op_desc.GetAttrMap()); op_desc.GetAttrMap());
} }
......
...@@ -79,7 +79,7 @@ class OpRegistry { ...@@ -79,7 +79,7 @@ class OpRegistry {
static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc); static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);
static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc); static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
}; };
template <typename PlaceType, bool at_end, size_t I, typename... KernelType> template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
...@@ -126,6 +126,14 @@ class OpKernelRegistrar : public Registrar { ...@@ -126,6 +126,14 @@ class OpKernelRegistrar : public Registrar {
__test_global_namespace_##uniq_name##__>::value, \ __test_global_namespace_##uniq_name##__>::value, \
msg) msg)
/*
The variadic arguments should be class types derived from one of the
following classes:
OpProtoAndCheckerMaker
GradOpDescMakerBase
VarTypeInference
InferShapeBase
*/
#define REGISTER_OPERATOR(op_type, op_class, ...) \ #define REGISTER_OPERATOR(op_type, op_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \ __reg_op__##op_type, \
...@@ -144,15 +152,24 @@ class OpKernelRegistrar : public Registrar { ...@@ -144,15 +152,24 @@ class OpKernelRegistrar : public Registrar {
} }
/** /**
* Macro to register Operator. * Macro to register Operator. When the input is duplicable, you should
* use REGISTER_OP_EX with drop_empty_grad=false instead.
*/ */
#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ #define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class) \ grad_op_class) \
REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, true)
// When an argument is duplicable, we need to use this version.
// Perhaps we can omit the DropEmptyIG template parameter and
// only have one version of REGISTER_OP.
#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \
grad_op_class, drop_empty_grad) \
REGISTER_OPERATOR(grad_op_type, grad_op_class); \ REGISTER_OPERATOR(grad_op_type, grad_op_class); \
class _GradOpDescMaker_##grad_op_type##_ \ class _GradOpDescMaker_##grad_op_type##_ \
: public ::paddle::framework::DefaultGradOpDescMaker<true> { \ : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
using ::paddle::framework::DefaultGradOpDescMaker< \ using ::paddle::framework::DefaultGradOpDescMaker< \
true>::DefaultGradOpDescMaker; \ drop_empty_grad>::DefaultGradOpDescMaker; \
\ \
protected: \ protected: \
virtual std::string GradOpType() const { return #grad_op_type; } \ virtual std::string GradOpType() const { return #grad_op_type; } \
......
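Usage-wise, REGISTER_OP keeps the previous behavior (drop_empty_grad = true), while an operator with a duplicable input registers through REGISTER_OP_EX and passes false so that empty input gradients are kept and the variable-to-gradient correspondence stays unambiguous. A hypothetical registration (op and class names are placeholders, not operators added by this change):

    namespace ops = paddle::operators;
    REGISTER_OP_EX(concat_like, ops::ConcatLikeOp, ops::ConcatLikeOpMaker,
                   concat_like_grad, ops::ConcatLikeGradOp, false);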
...@@ -18,49 +18,49 @@ limitations under the License. */ ...@@ -18,49 +18,49 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) { BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
auto *b = desc_.add_blocks(); auto *b = desc_.add_blocks();
b->set_parent_idx(parent.ID()); b->set_parent_idx(parent.ID());
b->set_idx(desc_.blocks_size() - 1); b->set_idx(desc_.blocks_size() - 1);
blocks_.emplace_back(new BlockDescBind(this, b)); blocks_.emplace_back(new BlockDesc(this, b));
return blocks_.back().get(); return blocks_.back().get();
} }
proto::ProgramDesc *ProgramDescBind::Proto() { proto::ProgramDesc *ProgramDesc::Proto() {
for (auto &block : blocks_) { for (auto &block : blocks_) {
block->Flush(); block->Flush();
} }
return &desc_; return &desc_;
} }
ProgramDescBind::ProgramDescBind() { ProgramDesc::ProgramDesc() {
auto *block = desc_.mutable_blocks()->Add(); auto *block = desc_.mutable_blocks()->Add();
block->set_idx(kRootBlockIndex); block->set_idx(kRootBlockIndex);
block->set_parent_idx(kNoneBlockIndex); block->set_parent_idx(kNoneBlockIndex);
blocks_.emplace_back(new BlockDescBind(this, block)); blocks_.emplace_back(new BlockDesc(this, block));
} }
ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) { ProgramDesc::ProgramDesc(const ProgramDesc &o) {
desc_ = o.desc_; desc_ = o.desc_;
for (int i = 0; i < desc_.blocks_size(); ++i) { for (int i = 0; i < desc_.blocks_size(); ++i) {
auto *block = desc_.mutable_blocks(i); auto *block = desc_.mutable_blocks(i);
blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this)); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
} }
} }
ProgramDescBind::ProgramDescBind(const proto::ProgramDesc &desc) { ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
desc_ = desc; desc_ = desc;
for (auto &block_desc : *desc_.mutable_blocks()) { for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDescBind(this, &block_desc)); blocks_.emplace_back(new BlockDesc(this, &block_desc));
} }
} }
ProgramDescBind::ProgramDescBind(const std::string &binary_str) { ProgramDesc::ProgramDesc(const std::string &binary_str) {
PADDLE_ENFORCE(desc_.ParseFromString(binary_str), PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
"Fail to parse program_desc from binary string."); "Fail to parse program_desc from binary string.");
for (auto &block_desc : *desc_.mutable_blocks()) { for (auto &block_desc : *desc_.mutable_blocks()) {
blocks_.emplace_back(new BlockDescBind(this, &block_desc)); blocks_.emplace_back(new BlockDesc(this, &block_desc));
} }
} }
......
...@@ -23,23 +23,23 @@ limitations under the License. */ ...@@ -23,23 +23,23 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class BlockDescBind; class BlockDesc;
class ProgramDescBind { class ProgramDesc {
public: public:
ProgramDescBind(); ProgramDesc();
explicit ProgramDescBind(const proto::ProgramDesc &desc); explicit ProgramDesc(const proto::ProgramDesc &desc);
ProgramDescBind(const ProgramDescBind &o); ProgramDesc(const ProgramDesc &o);
explicit ProgramDescBind(const std::string &binary_str); explicit ProgramDesc(const std::string &binary_str);
BlockDescBind *AppendBlock(const BlockDescBind &parent); BlockDesc *AppendBlock(const BlockDesc &parent);
BlockDescBind *MutableBlock(size_t idx) { return blocks_[idx].get(); } BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); }
const BlockDescBind &Block(size_t idx) const { return *blocks_[idx]; } const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; }
size_t Size() const { return blocks_.size(); } size_t Size() const { return blocks_.size(); }
...@@ -48,7 +48,7 @@ class ProgramDescBind { ...@@ -48,7 +48,7 @@ class ProgramDescBind {
private: private:
proto::ProgramDesc desc_; proto::ProgramDesc desc_;
std::vector<std::unique_ptr<BlockDescBind>> blocks_; std::vector<std::unique_ptr<BlockDesc>> blocks_;
}; };
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
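As a quick orientation for the renamed class, a minimal usage sketch based only on the members shown above (variable names are illustrative):

using paddle::framework::BlockDesc;
using paddle::framework::ProgramDesc;

ProgramDesc program;                          // constructed with the root block
BlockDesc *root = program.MutableBlock(0);
program.AppendBlock(*root);                   // append a child of the root block

std::string binary;
program.Proto()->SerializeToString(&binary);  // Proto() flushes every block first
ProgramDesc restored(binary);                 // rebuild the program from the bytes
// restored.Size() == program.Size()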
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
TEST(ProgramDesc, copy_ctor) { TEST(ProgramDesc, copy_ctor) {
ProgramDescBind program; ProgramDesc program;
auto* global_block = program.MutableBlock(0); auto* global_block = program.MutableBlock(0);
auto* x = global_block->Var("X"); auto* x = global_block->Var("X");
x->SetType(proto::VarDesc_VarType_LOD_TENSOR); x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
...@@ -42,12 +42,12 @@ TEST(ProgramDesc, copy_ctor) { ...@@ -42,12 +42,12 @@ TEST(ProgramDesc, copy_ctor) {
out->SetType(proto::VarDesc_VarType_LOD_TENSOR); out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
op->SetOutput("Y", {out->Name()}); op->SetOutput("Y", {out->Name()});
ProgramDescBind program_copy(program); ProgramDesc program_copy(program);
auto* global_block_copy = program_copy.MutableBlock(0); auto* global_block_copy = program_copy.MutableBlock(0);
ASSERT_NE(global_block, global_block_copy); ASSERT_NE(global_block, global_block_copy);
auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
ASSERT_TRUE(global_block_copy->HasVar(name)); ASSERT_TRUE(global_block_copy->HasVar(name));
auto* copy = global_block_copy->Var(name); auto* copy = global_block_copy->Var(name);
ASSERT_NE(copy, var_before); ASSERT_NE(copy, var_before);
...@@ -81,7 +81,7 @@ TEST(ProgramDesc, copy_ctor) { ...@@ -81,7 +81,7 @@ TEST(ProgramDesc, copy_ctor) {
} }
TEST(ProgramDescBind, serialize_and_deserialize) { TEST(ProgramDescBind, serialize_and_deserialize) {
ProgramDescBind program_origin; ProgramDesc program_origin;
auto* global_block = program_origin.MutableBlock(0); auto* global_block = program_origin.MutableBlock(0);
auto* x = global_block->Var("X"); auto* x = global_block->Var("X");
x->SetType(proto::VarDesc_VarType_LOD_TENSOR); x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
...@@ -107,11 +107,11 @@ TEST(ProgramDescBind, serialize_and_deserialize) { ...@@ -107,11 +107,11 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
std::string binary_str; std::string binary_str;
program_origin.Proto()->SerializeToString(&binary_str); program_origin.Proto()->SerializeToString(&binary_str);
ProgramDescBind program_restored(binary_str); ProgramDesc program_restored(binary_str);
auto* global_block_restored = program_restored.MutableBlock(0); auto* global_block_restored = program_restored.MutableBlock(0);
ASSERT_NE(global_block, global_block_restored); ASSERT_NE(global_block, global_block_restored);
auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
ASSERT_TRUE(global_block_restored->HasVar(name)); ASSERT_TRUE(global_block_restored->HasVar(name));
auto* restored = global_block_restored->Var(name); auto* restored = global_block_restored->Var(name);
ASSERT_NE(restored, var_before); ASSERT_NE(restored, var_before);
......
...@@ -29,7 +29,7 @@ namespace ops = paddle::operators; ...@@ -29,7 +29,7 @@ namespace ops = paddle::operators;
void AddOp(const std::string &type, const f::VariableNameMap &inputs, void AddOp(const std::string &type, const f::VariableNameMap &inputs,
const f::VariableNameMap &outputs, f::AttributeMap attrs, const f::VariableNameMap &outputs, f::AttributeMap attrs,
paddle::framework::BlockDescBind *block) { paddle::framework::BlockDesc *block) {
// insert output // insert output
for (auto kv : outputs) { for (auto kv : outputs) {
for (auto v : kv.second) { for (auto v : kv.second) {
...@@ -51,8 +51,8 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, ...@@ -51,8 +51,8 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
} }
TEST(Prune, one_operator) { TEST(Prune, one_operator) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
block); block);
...@@ -69,8 +69,8 @@ TEST(Prune, one_operator) { ...@@ -69,8 +69,8 @@ TEST(Prune, one_operator) {
} }
TEST(Prune, forward) { TEST(Prune, forward) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
block); block);
...@@ -92,8 +92,8 @@ TEST(Prune, forward) { ...@@ -92,8 +92,8 @@ TEST(Prune, forward) {
} }
TEST(Prune, multi_input_op) { TEST(Prune, multi_input_op) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
block); block);
...@@ -113,8 +113,8 @@ TEST(Prune, multi_input_op) { ...@@ -113,8 +113,8 @@ TEST(Prune, multi_input_op) {
} }
TEST(Prune, multi_output_op) { TEST(Prune, multi_output_op) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
f::AttributeMap{}, block); f::AttributeMap{}, block);
...@@ -132,8 +132,8 @@ TEST(Prune, multi_output_op) { ...@@ -132,8 +132,8 @@ TEST(Prune, multi_output_op) {
} }
TEST(Prune, multi_target) { TEST(Prune, multi_target) {
f::ProgramDescBind program; f::ProgramDesc program;
f::BlockDescBind *block = program.MutableBlock(0); f::BlockDesc *block = program.MutableBlock(0);
AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
f::AttributeMap{}, block); f::AttributeMap{}, block);
......
...@@ -25,11 +25,9 @@ ...@@ -25,11 +25,9 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class OperatorBase; class OperatorBase;
class OpDescBind; class OpDesc;
class BlockDescBind;
class BlockDesc;
class InferShapeContext; class InferShapeContext;
class BlockDescBind; class BlockDesc;
using VariableNameMap = std::map<std::string, std::vector<std::string>>; using VariableNameMap = std::map<std::string, std::vector<std::string>>;
...@@ -37,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>; ...@@ -37,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using Attribute = using Attribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>, boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool, std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDescBind*>; std::vector<bool>, BlockDesc*>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
...@@ -45,13 +43,13 @@ using OpCreator = std::function<OperatorBase*( ...@@ -45,13 +43,13 @@ using OpCreator = std::function<OperatorBase*(
const std::string& /*type*/, const VariableNameMap& /*inputs*/, const std::string& /*type*/, const VariableNameMap& /*inputs*/,
const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>; const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDescBind>>( using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
const OpDescBind&, const std::unordered_set<std::string>& /*no_grad_set*/, const OpDesc&, const std::unordered_set<std::string>& /*no_grad_set*/,
std::unordered_map<std::string, std::string>* /*grad_to_var*/, std::unordered_map<std::string, std::string>* /*grad_to_var*/,
const std::vector<BlockDescBind*>& grad_block)>; const std::vector<BlockDesc*>& grad_block)>;
using InferVarTypeFN = std::function<void(const OpDescBind& /*op_desc*/, using InferVarTypeFN =
BlockDescBind* /*block*/)>; std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
using InferShapeFN = std::function<void(InferShapeContext*)>; using InferShapeFN = std::function<void(InferShapeContext*)>;
......
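These aliases are plain std::function signatures, so any callable of the matching shape can be stored in them. An illustrative sketch for InferVarTypeFN follows; the lambda body is an assumption for demonstration only, not code from this change.

using paddle::framework::BlockDesc;
using paddle::framework::InferVarTypeFN;
using paddle::framework::OpDesc;

InferVarTypeFN mark_outputs_as_lod_tensor = [](const OpDesc &op_desc,
                                               BlockDesc *block) {
  // mark every "Out" variable of the op as a plain LoD tensor
  for (auto &name : op_desc.Output("Out")) {
    block->Var(name)->SetType(paddle::framework::proto::VarDesc::LOD_TENSOR);
  }
};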
...@@ -18,29 +18,27 @@ limitations under the License. */ ...@@ -18,29 +18,27 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
proto::VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); } proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); }
void VarDescBind::SetType(proto::VarDesc::VarType type) { void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); }
desc_.set_type(type);
}
void VarDescBind::SetShape(const std::vector<int64_t> &dims) { void VarDesc::SetShape(const std::vector<int64_t> &dims) {
VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
} }
void VarDescBind::SetDataType(proto::DataType data_type) { void VarDesc::SetDataType(proto::DataType data_type) {
mutable_tensor_desc()->set_data_type(data_type); mutable_tensor_desc()->set_data_type(data_type);
} }
std::vector<int64_t> VarDescBind::Shape() const { std::vector<int64_t> VarDesc::Shape() const {
return RepeatedToVector(tensor_desc().dims()); return RepeatedToVector(tensor_desc().dims());
} }
proto::DataType VarDescBind::GetDataType() const { proto::DataType VarDesc::GetDataType() const {
return tensor_desc().data_type(); return tensor_desc().data_type();
} }
void VarDescBind::SetLoDLevel(int32_t lod_level) { void VarDesc::SetLoDLevel(int32_t lod_level) {
switch (desc_.type()) { switch (desc_.type()) {
case proto::VarDesc::LOD_TENSOR: case proto::VarDesc::LOD_TENSOR:
desc_.mutable_lod_tensor()->set_lod_level(lod_level); desc_.mutable_lod_tensor()->set_lod_level(lod_level);
...@@ -54,7 +52,7 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) { ...@@ -54,7 +52,7 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
} }
} }
int32_t VarDescBind::GetLodLevel() const { int32_t VarDesc::GetLodLevel() const {
switch (desc_.type()) { switch (desc_.type()) {
case proto::VarDesc::LOD_TENSOR: case proto::VarDesc::LOD_TENSOR:
return desc_.lod_tensor().lod_level(); return desc_.lod_tensor().lod_level();
...@@ -66,7 +64,7 @@ int32_t VarDescBind::GetLodLevel() const { ...@@ -66,7 +64,7 @@ int32_t VarDescBind::GetLodLevel() const {
} }
} }
const proto::TensorDesc &VarDescBind::tensor_desc() const { const proto::TensorDesc &VarDesc::tensor_desc() const {
PADDLE_ENFORCE(desc_.has_type(), "tensor_desc() must be invoked after the type is set"); PADDLE_ENFORCE(desc_.has_type(), "tensor_desc() must be invoked after the type is set");
switch (desc_.type()) { switch (desc_.type()) {
case proto::VarDesc::SELECTED_ROWS: case proto::VarDesc::SELECTED_ROWS:
...@@ -80,7 +78,7 @@ const proto::TensorDesc &VarDescBind::tensor_desc() const { ...@@ -80,7 +78,7 @@ const proto::TensorDesc &VarDescBind::tensor_desc() const {
} }
} }
proto::TensorDesc *VarDescBind::mutable_tensor_desc() { proto::TensorDesc *VarDesc::mutable_tensor_desc() {
PADDLE_ENFORCE(desc_.has_type(), PADDLE_ENFORCE(desc_.has_type(),
"invoke MutableTensorDesc must after set type"); "invoke MutableTensorDesc must after set type");
switch (desc_.type()) { switch (desc_.type()) {
......
...@@ -53,14 +53,14 @@ inline void VectorToRepeated(const std::vector<bool> &vec, ...@@ -53,14 +53,14 @@ inline void VectorToRepeated(const std::vector<bool> &vec,
} }
} }
class VarDescBind { class VarDesc {
public: public:
explicit VarDescBind(const std::string &name) { explicit VarDesc(const std::string &name) {
desc_.set_name(name); desc_.set_name(name);
desc_.set_type(proto::VarDesc::LOD_TENSOR); desc_.set_type(proto::VarDesc::LOD_TENSOR);
} }
explicit VarDescBind(const proto::VarDesc &desc) : desc_(desc) {} explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
proto::VarDesc *Proto() { return &desc_; } proto::VarDesc *Proto() { return &desc_; }
......
...@@ -21,8 +21,7 @@ namespace framework { ...@@ -21,8 +21,7 @@ namespace framework {
class VarTypeInference { class VarTypeInference {
public: public:
virtual ~VarTypeInference() {} virtual ~VarTypeInference() {}
virtual void operator()(const OpDescBind& op_desc, virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
BlockDescBind* block) const = 0;
}; };
} // namespace framework } // namespace framework
......
...@@ -33,8 +33,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker { ...@@ -33,8 +33,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
class SumOpVarTypeInference : public VarTypeInference { class SumOpVarTypeInference : public VarTypeInference {
public: public:
void operator()(const OpDescBind &op_desc, void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
BlockDescBind *block) const override {
auto &inputs = op_desc.Input("X"); auto &inputs = op_desc.Input("X");
auto default_var_type = proto::VarDesc::SELECTED_ROWS; auto default_var_type = proto::VarDesc::SELECTED_ROWS;
...@@ -62,7 +61,7 @@ namespace paddle { ...@@ -62,7 +61,7 @@ namespace paddle {
namespace framework { namespace framework {
TEST(InferVarType, sum_op) { TEST(InferVarType, sum_op) {
ProgramDescBind prog; ProgramDesc prog;
auto *op = prog.MutableBlock(0)->AppendOp(); auto *op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum"); op->SetType("sum");
op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetInput("X", {"test_a", "test_b", "test_c"});
...@@ -85,7 +84,7 @@ TEST(InferVarType, sum_op) { ...@@ -85,7 +84,7 @@ TEST(InferVarType, sum_op) {
} }
TEST(InferVarType, sum_op_without_infer_var_type) { TEST(InferVarType, sum_op_without_infer_var_type) {
ProgramDescBind prog; ProgramDesc prog;
auto *op = prog.MutableBlock(0)->AppendOp(); auto *op = prog.MutableBlock(0)->AppendOp();
op->SetType("sum_without_infer_var_type"); op->SetType("sum_without_infer_var_type");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
......
...@@ -62,33 +62,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place, ...@@ -62,33 +62,6 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
} }
} }
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
}
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
}
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
}
#endif #endif
} // namespace memory } // namespace memory
......
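The block removed here is a duplicated copy of the synchronous GPU specializations. Assuming the surviving definitions earlier in this file keep the same five-argument signature shown above, they would be invoked like this (pointers and sizes are placeholders):

paddle::platform::CPUPlace cpu;
paddle::platform::GPUPlace gpu(0);

// device-to-host, resolved to Copy<CPUPlace, GPUPlace>
paddle::memory::Copy(cpu, host_ptr, gpu, device_ptr, num_bytes);
// host-to-device, resolved to Copy<GPUPlace, CPUPlace>
paddle::memory::Copy(gpu, device_ptr, cpu, host_ptr, num_bytes);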
...@@ -149,14 +149,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { ...@@ -149,14 +149,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("lod_tensor_to_array"); grad_op->SetType("lod_tensor_to_array");
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetInput("RankTable", Input("RankTable"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -121,12 +121,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker { ...@@ -121,12 +121,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDescBind(); auto *op = new framework::OpDesc();
op->SetType("assign"); op->SetType("assign");
op->SetInput("X", OutputGrad("Out")); op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X")); op->SetOutput("Out", InputGrad("X"));
return std::unique_ptr<framework::OpDescBind>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
}; };
......
...@@ -119,8 +119,8 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { ...@@ -119,8 +119,8 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
class BeamSearchDecodeInferVarType : public framework::VarTypeInference { class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind& op_desc, void operator()(const framework::OpDesc& op_desc,
framework::BlockDescBind* block) const override { framework::BlockDesc* block) const override {
for (auto& o : op_desc.Output("SentenceIds")) { for (auto& o : op_desc.Output("SentenceIds")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
} }
......
...@@ -52,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { ...@@ -52,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto grad = new framework::OpDescBind(); auto grad = new framework::OpDesc();
grad->SetType("cast"); grad->SetType("cast");
grad->SetInput("X", OutputGrad("Out")); grad->SetInput("X", OutputGrad("Out"));
grad->SetOutput("Out", InputGrad("X")); grad->SetOutput("Out", InputGrad("X"));
grad->SetAttr("out_dtype", GetAttr("in_dtype")); grad->SetAttr("out_dtype", GetAttr("in_dtype"));
grad->SetAttr("in_dtype", GetAttr("out_dtype")); grad->SetAttr("in_dtype", GetAttr("out_dtype"));
return std::unique_ptr<framework::OpDescBind>(grad); return std::unique_ptr<framework::OpDesc>(grad);
} }
}; };
......
...@@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { ...@@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
ops::ConcatOpGrad) ops::ConcatOpGrad, false)
REGISTER_OP_CPU_KERNEL(concat, REGISTER_OP_CPU_KERNEL(concat,
ops::ConcatKernel<paddle::platform::CPUPlace, float>) ops::ConcatKernel<paddle::platform::CPUPlace, float>)
REGISTER_OP_CPU_KERNEL(concat_grad, REGISTER_OP_CPU_KERNEL(concat_grad,
......
...@@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp { ...@@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp {
scopes->front() = &scope.NewScope(); scopes->front() = &scope.NewScope();
auto &cur_scope = *scopes->front(); auto &cur_scope = *scopes->front();
auto *block = Attr<framework::BlockDescBind *>("sub_block"); auto *block = Attr<framework::BlockDesc *>("sub_block");
framework::Executor exec(dev_ctx); framework::Executor exec(dev_ctx);
exec.Run(*block->Program(), &cur_scope, block->ID(), false); exec.Run(*block->Program(), &cur_scope, block->ID(), false);
} }
...@@ -86,7 +86,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -86,7 +86,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"(std::vector<Scope*>) The step scope of conditional block. To " "(std::vector<Scope*>) The step scope of conditional block. To "
"unify the conditional block, rnn and while op, the type of " "unify the conditional block, rnn and while op, the type of "
"scope is std::vector<Scope*>"); "scope is std::vector<Scope*>");
AddAttr<framework::BlockDescBind *>( AddAttr<framework::BlockDesc *>(
"sub_block", "The step block of conditional block operator"); "sub_block", "The step block of conditional block operator");
AddComment(R"DOC(Conditional block operator AddComment(R"DOC(Conditional block operator
...@@ -116,7 +116,7 @@ class ConditionalBlockGradOp : public ConditionalOp { ...@@ -116,7 +116,7 @@ class ConditionalBlockGradOp : public ConditionalOp {
auto &scopes = scope_var->Get<std::vector<framework::Scope *>>(); auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
framework::Scope &cur_scope = *scopes[0]; framework::Scope &cur_scope = *scopes[0];
auto *block = Attr<framework::BlockDescBind *>("sub_block"); auto *block = Attr<framework::BlockDesc *>("sub_block");
framework::Executor exec(dev_ctx); framework::Executor exec(dev_ctx);
exec.Run(*block->Program(), &cur_scope, block->ID(), false); exec.Run(*block->Program(), &cur_scope, block->ID(), false);
...@@ -170,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { ...@@ -170,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto grad_op = new framework::OpDescBind(); auto grad_op = new framework::OpDesc();
grad_op->SetType("conditional_block_grad"); grad_op->SetType("conditional_block_grad");
grad_op->SetInput("X", Input("X")); grad_op->SetInput("X", Input("X"));
grad_op->SetInput("Params", Input("Params")); grad_op->SetInput("Params", Input("Params"));
grad_op->SetInput("Out", Output("Out")); grad_op->SetInput("Out", Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetInput("Scope", Output("Scope")); grad_op->SetInput("Scope", Output("Scope"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params")); grad_op->SetOutput(framework::GradVarName("Params"),
InputGrad("Params", false));
grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -21,8 +21,6 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { ...@@ -21,8 +21,6 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker {
public: public:
CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: Conv2DTransposeOpMaker(proto, op_checker) { : Conv2DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault({1, 1});
AddAttr<int>("workspace_size_MB", AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, " "workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be " "workspace is a section of GPU memory which will be "
...@@ -37,8 +35,6 @@ class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker { ...@@ -37,8 +35,6 @@ class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker {
public: public:
CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: Conv3DTransposeOpMaker(proto, op_checker) { : Conv3DTransposeOpMaker(proto, op_checker) {
AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
.SetDefault({1, 1, 1});
AddAttr<int>("workspace_size_MB", AddAttr<int>("workspace_size_MB",
"workspace size for cudnn, in MB, " "workspace size for cudnn, in MB, "
"workspace is a section of GPU memory which will be " "workspace is a section of GPU memory which will be "
......
...@@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
"ConvTransposeOp intput should be 4-D or 5-D tensor."); "ConvTransposeOp intput should be 4-D or 5-D tensor.");
...@@ -41,14 +42,18 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -41,14 +42,18 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
"ConvTransposeOp paddings dimension and strides " "ConvTransposeOp paddings dimension and strides "
"dimension should be the same."); "dimension should be the same.");
PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(),
"ConvTransposeOp paddings dimension and dilations "
"dimension should be the same.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"In ConvTransposeOp, The input channel should be the same " "In ConvTransposeOp, The input channel should be the same "
"as the number of filters."); "as the number of filters.");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]}); std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
filter_dims[i + 2]); filter_extent);
} }
ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
} }
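A small worked example of the dilation-aware shape rule, with made-up numbers (not taken from this commit):

// in_size = 5, stride = 2, padding = 1, filter_size = 3, dilation = 2
// filter_extent = dilation * (filter_size - 1) + 1 = 2 * (3 - 1) + 1 = 5
// out_size = (in_size - 1) * stride - 2 * padding + filter_extent
//          = (5 - 1) * 2 - 2 * 1 + 5 = 11

With dilation = 1 the extent reduces to filter_size, so the formula falls back to the previous expression (in_size - 1) * stride - 2 * padding + filter_size.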
...@@ -73,6 +78,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, ...@@ -73,6 +78,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. " "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is also NCHW."); "The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("dilations",
"(vector<int> default:{1, 1}), the "
"dilations(h_dilation, w_dilation) of convolution "
"transpose operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"strides", "strides",
"(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of " "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
...@@ -87,7 +98,7 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, ...@@ -87,7 +98,7 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
Convolution2D Transpose Operator. Convolution2D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and dilations, strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
number of channels, H is the height of the feature, and W is the width of the feature. number of channels, H is the height of the feature, and W is the width of the feature.
...@@ -136,6 +147,13 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, ...@@ -136,6 +147,13 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
"Where N is batch size, C is " "Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the " "the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature."); "height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>(
"dilations",
"(vector<int> default:{1, 1, 1}), the "
"dilations(d_dilation,h_dilation, w_dilation) of convolution "
"transpose operator.")
.SetDefault({1, 1, 1});
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1, 1}), the " "(vector<int> default:{1, 1, 1}), the "
"strides{d_stride, h_stride, w_stride} of " "strides{d_stride, h_stride, w_stride} of "
...@@ -149,7 +167,7 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, ...@@ -149,7 +167,7 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
Convolution3D Transpose Operator. Convolution3D Transpose Operator.
The convolution transpose operation calculates the output based on the input, filter The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the and dilations, strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape. parameters is checked in the infer-shape.
Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
number of channels, D is the depth of the feature, H is the height of the feature, number of channels, D is the depth of the feature, H is the height of the feature,
......
...@@ -61,6 +61,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -61,6 +61,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
// groups will always be disabled in conv2dtranspose. // groups will always be disabled in conv2dtranspose.
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -113,7 +114,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { ...@@ -113,7 +114,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im; math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
math::Col2VolFunctor<DeviceContext, T> col2vol; math::Col2VolFunctor<DeviceContext, T> col2vol;
std::vector<int> dilations({1, 1, 1});
// convolution transpose: gemm + col2im or col2vol (similar to conv-backward // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
// on input) // on input)
...@@ -165,6 +165,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -165,6 +165,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> strides = context.Attr<std::vector<int>>("strides");
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -219,7 +220,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> { ...@@ -219,7 +220,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col; math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
math::Vol2ColFunctor<DeviceContext, T> vol2col; math::Vol2ColFunctor<DeviceContext, T> vol2col;
std::vector<int> dilations({1, 1, 1});
if (input_grad) { if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace()); input_grad->mutable_data<T>(context.GetPlace());
......
...@@ -24,10 +24,10 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { ...@@ -24,10 +24,10 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FillZerosLikeOp should not be null."); "Input(X) of FillZerosLikeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Y) of FillZerosLikeOp should not be null."); "Output(Out) of FillZerosLikeOp should not be null.");
ctx->SetOutputDim("Y", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Y"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}; };
...@@ -36,7 +36,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -36,7 +36,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of fill-zeros-like op."); AddInput("X", "The input of fill-zeros-like op.");
AddOutput("Y", "The variable will be filled up with zeros."); AddOutput("Out", "The variable will be filled up with zeros.");
AddComment(R"DOC( AddComment(R"DOC(
FillZerosLike Operator. FillZerosLike Operator.
......
...@@ -23,7 +23,7 @@ template <typename DeviceContext, typename T> ...@@ -23,7 +23,7 @@ template <typename DeviceContext, typename T>
class FillZerosLikeKernel : public framework::OpKernel<T> { class FillZerosLikeKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* out = context.Output<framework::Tensor>("Y"); auto* out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
math::SetConstant<DeviceContext, T> setter; math::SetConstant<DeviceContext, T> setter;
......
...@@ -93,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { ...@@ -93,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
public: public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("increment"); grad_op->SetType("increment");
grad_op->SetInput("X", Output("Out")); grad_op->SetInput("X", Output("Out"));
grad_op->SetOutput("Out", Input("X")); grad_op->SetOutput("Out", Input("X"));
grad_op->SetAttr("step", -boost::get<float>(GetAttr("step"))); grad_op->SetAttr("step", -boost::get<float>(GetAttr("step")));
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -63,8 +63,8 @@ class LoDRankTableInferShape : public framework::InferShapeBase { ...@@ -63,8 +63,8 @@ class LoDRankTableInferShape : public framework::InferShapeBase {
class LoDRankTableInferVarType : public framework::VarTypeInference { class LoDRankTableInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDescBind *block) const override { framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("Out")) { for (auto &o : op_desc.Output("Out")) {
block->FindRecursiveOrCreateVar(o)->SetType( block->FindRecursiveOrCreateVar(o)->SetType(
framework::proto::VarDesc::LOD_RANK_TABLE); framework::proto::VarDesc::LOD_RANK_TABLE);
......
...@@ -127,8 +127,8 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { ...@@ -127,8 +127,8 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {
class LoDTensorToArrayInferVarType : public framework::VarTypeInference { class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDescBind *block) const override { framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) { for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
} }
...@@ -140,14 +140,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { ...@@ -140,14 +140,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("array_to_lod_tensor"); grad_op->SetType("array_to_lod_tensor");
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetInput("RankTable", Input("RankTable"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -108,8 +108,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -108,8 +108,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind& op_desc, void operator()(const framework::OpDesc& op_desc,
framework::BlockDescBind* block) const override { framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
auto attr = op_desc.GetAttr("is_sparse"); auto attr = op_desc.GetAttr("is_sparse");
bool is_sparse = boost::get<bool>(attr); bool is_sparse = boost::get<bool>(attr);
......
...@@ -67,18 +67,45 @@ void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -67,18 +67,45 @@ void RowwiseAdd<DeviceContext, T>::operator()(const DeviceContext& context,
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
framework::Tensor* vector) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size);
auto vec = framework::EigenMatrix<T>::From(*vector);
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
Eigen::array<int, 2> shape({{1, static_cast<int>(size)}}); auto vec = framework::EigenVector<T>::Flatten(*out);
vec.reshape(shape).device(*context.eigen_device()) =
in.sum(Eigen::array<int, 1>({{0}})).reshape(shape); vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
} }
// Specialize for CPU, since Eigen implements a general reduce. However,
// colwise-sum can be easily implemented, and the general reduce has a huge
// overhead on CPU.
template <typename T>
class ColwiseSum<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims();
auto height = in_dims[0];
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>();
for (int64_t i = 0; i < height; ++i) {
for (int64_t j = 0; j < size; ++j) {
if (i == 0) {
out_buf[j] = in_buf[i * size + j];
} else {
out_buf[j] += in_buf[i * size + j];
}
}
}
}
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
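A tiny illustration of what both code paths compute (the numbers are made up):

// input (height = 2, size = 3):  [[1, 2, 3],
//                                 [4, 5, 6]]
// out   (numel() == 3):          [5, 7, 9]

The Eigen path reduces along dimension 0 in a single expression, while the CPU specialization accumulates row by row into out_buf; both yield the same column-wise sums.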
...@@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { ...@@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto* grad_op = new framework::OpDescBind(); auto* grad_op = new framework::OpDesc();
grad_op->SetType("mean_grad"); grad_op->SetType("mean_grad");
grad_op->SetInput("X", Input("X")); grad_op->SetInput("X", Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -161,15 +161,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker { ...@@ -161,15 +161,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("split_lod_tensor"); grad_op->SetType("split_lod_tensor");
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetInput("Mask", Input("Mask")); grad_op->SetInput("Mask", Input("Mask"));
grad_op->SetOutput("OutTrue", InputGrad("InTrue")); grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
grad_op->SetOutput("OutFalse", InputGrad("InFalse")); grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { ...@@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
public: public:
using framework::GradOpDescMakerBase::GradOpDescMakerBase; using framework::GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<framework::OpDescBind>> operator()() std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
const override { std::vector<std::unique_ptr<framework::OpDesc>> ops;
std::vector<std::unique_ptr<framework::OpDescBind>> ops;
auto x_g = InputGrad("X"); auto x_g = InputGrad("X");
if (!x_g.empty()) { if (!x_g.empty()) {
auto *x_g_op = new framework::OpDescBind(); auto *x_g_op = new framework::OpDesc();
x_g_op->SetType("scale"); x_g_op->SetType("scale");
x_g_op->SetInput("X", OutputGrad("Out")); x_g_op->SetInput("X", OutputGrad("Out"));
x_g_op->SetOutput("Out", x_g); x_g_op->SetOutput("Out", x_g);
...@@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { ...@@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase {
auto y_g = InputGrad("Y"); auto y_g = InputGrad("Y");
if (!y_g.empty()) { if (!y_g.empty()) {
auto *y_g_op = new framework::OpDescBind(); auto *y_g_op = new framework::OpDesc();
y_g_op->SetType("scale"); y_g_op->SetType("scale");
y_g_op->SetInput("X", OutputGrad("Out")); y_g_op->SetInput("X", OutputGrad("Out"));
y_g_op->SetOutput("Out", y_g); y_g_op->SetOutput("Out", y_g);
......
...@@ -73,39 +73,50 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -73,39 +73,50 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
MulOpMaker(OpProto* proto, OpAttrChecker* op_checker) MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of mul op"); AddInput("X", "(Tensor), The first input tensor of mul op.");
AddInput("Y", "The second input of mul op"); AddInput("Y", "(Tensor), The second input tensor of mul op.");
AddOutput("Out", "The output of mul op"); AddOutput("Out", "(Tensor), The output tensor of mul op.");
AddAttr<int>( AddAttr<int>(
"x_num_col_dims", "x_num_col_dims",
"(int, default 1) " R"DOC((int, default 1), The mul_op can take tensors with more than two
R"DOC(mul_op can take tensors with more than two dimensions as input `X`, dimensions as its inputs. If the input $X$ is a tensor with more
in that case, tensors will be reshaped to a matrix. The matrix's first than two dimensions, $X$ will be flattened into a two-dimensional
dimension(column length) will be the product of tensor's last matrix first. The flattening rule is: the first `num_col_dims`
`num_col_dims` dimensions, and the matrix's second dimension(row length) will be flattened to form the first dimension of the final matrix
will be the product of tensor's first `rank - num_col_dims` dimensions. (the height of the matrix), and the rest `rank(X) - num_col_dims`
dimensions are flattened to form the second dimension of the final
matrix (the width of the matrix). As a result, height of the
flattened matrix is equal to the product of $X$'s first
`x_num_col_dims` dimensions' sizes, and width of the flattened
matrix is equal to the product of $X$'s last `rank(X) - x_num_col_dims`
dimensions' sizes. For example, suppose $X$ is a 5-dimensional
tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3.
Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] =
[24, 30].
)DOC") )DOC")
.SetDefault(1) .SetDefault(1)
.EqualGreaterThan(1); .EqualGreaterThan(1);
AddAttr<int>( AddAttr<int>(
"y_num_col_dims", "y_num_col_dims",
"(int, default 1) " R"DOC((int, default 1), The mul_op can take tensors with more than two,
R"DOC(mul_op can take tensors with more than two dimensions as input `Y`, dimensions as its inputs. If the input $Y$ is a tensor with more
in that case, tensors will be reshaped to a matrix. Just like input `X`. than two dimensions, $Y$ will be flattened into a two-dimensional
matrix first. The attribute `y_num_col_dims` determines how $Y$ is
flattened. See comments of `x_num_col_dims` for more details.
)DOC") )DOC")
.SetDefault(1) .SetDefault(1)
.EqualGreaterThan(1); .EqualGreaterThan(1);
AddComment(R"DOC( AddComment(R"DOC(
Mul Operator. Mul Operator.
This operator is used to perform matrix multiplication for input X and Y. This operator is used to perform matrix multiplication for input $X$ and $Y$.
The equation is: The equation is:
$$Out = X * Y$$ $$Out = X * Y$$
Both the input `X` and `Y` can carry the LoD (Level of Details) information, Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD information with input `X`. or not. But the output only shares the LoD information with input $X$.
)DOC"); )DOC");
} }
......
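As a companion to the `x_num_col_dims` example in the doc string above (illustrative shapes, not from this commit): if $Y$ has the shape [5, 6, 7, 8] and `y_num_col_dims` = 2, then $Y$ is flattened to [5 x 6, 7 x 8] = [30, 56]. Its height 30 matches the width 30 of the flattened $X$ from the example ([24, 30]), so the matrix product $Out = X * Y$ has the flattened shape [24, 56].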
...@@ -65,7 +65,7 @@ class NCCLTester : public ::testing::Test { ...@@ -65,7 +65,7 @@ class NCCLTester : public ::testing::Test {
} }
void NCCLInitOp() { void NCCLInitOp() {
std::unique_ptr<f::OpDescBind> op1(new f::OpDescBind); std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
op1->SetType("ncclInit"); op1->SetType("ncclInit");
op1->SetOutput("Communicator", {"comm"}); op1->SetOutput("Communicator", {"comm"});
...@@ -81,10 +81,9 @@ class NCCLTester : public ::testing::Test { ...@@ -81,10 +81,9 @@ class NCCLTester : public ::testing::Test {
} }
template <class T> template <class T>
void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc, void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu); std::unique_lock<std::mutex> lk(mu);
const f::OpDescBind *op1 = &op_desc; const f::OpDesc *op1 = &op_desc;
p::GPUPlace place(gpu_id); p::GPUPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id); auto &ctx = dev_ctxs.at(gpu_id);
...@@ -125,7 +124,7 @@ class NCCLTester : public ::testing::Test { ...@@ -125,7 +124,7 @@ class NCCLTester : public ::testing::Test {
// ncclInitOp with desc // ncclInitOp with desc
TEST(NCCL, ncclInitOp) { TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDescBind> op_desc(new f::OpDescBind); std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
op_desc->SetType("ncclInit"); op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"}); op_desc->SetOutput("Communicator", {"x1"});
...@@ -145,7 +144,7 @@ TEST(NCCL, ncclInitOp) { ...@@ -145,7 +144,7 @@ TEST(NCCL, ncclInitOp) {
// ncclAllReduceOp with desc // ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) { TEST_F(NCCLTester, ncclAllReduceOp) {
std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
op2->SetType("ncclAllReduce"); op2->SetType("ncclAllReduce");
op2->SetInput("X", {"st"}); op2->SetInput("X", {"st"});
op2->SetInput("Communicator", {"comm"}); op2->SetInput("Communicator", {"comm"});
...@@ -192,7 +191,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { ...@@ -192,7 +191,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
// ncclReduceOp with desc // ncclReduceOp with desc
TEST_F(NCCLTester, ncclReduceOp) { TEST_F(NCCLTester, ncclReduceOp) {
std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
const int kRoot = 0; const int kRoot = 0;
op2->SetType("ncclReduce"); op2->SetType("ncclReduce");
op2->SetInput("X", {"st"}); op2->SetInput("X", {"st"});
...@@ -240,7 +239,7 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -240,7 +239,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
// ncclBcastOp with desc // ncclBcastOp with desc
TEST_F(NCCLTester, ncclBcastOp) { TEST_F(NCCLTester, ncclBcastOp) {
std::unique_ptr<f::OpDescBind> op2(new f::OpDescBind); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
const int kRoot = 5; const int kRoot = 5;
op2->SetType("ncclBcast"); op2->SetType("ncclBcast");
op2->SetInput("X", {"st"}); op2->SetInput("X", {"st"});
......
...@@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { ...@@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto* bind = new framework::OpDescBind(); auto* bind = new framework::OpDesc();
bind->SetInput("X", Input("X")); bind->SetInput("X", Input("X"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
bind->SetAttrMap(Attrs()); bind->SetAttrMap(Attrs());
bind->SetType("pad_grad"); bind->SetType("pad_grad");
return std::unique_ptr<framework::OpDescBind>(bind); return std::unique_ptr<framework::OpDesc>(bind);
} }
}; };
......
...@@ -234,7 +234,7 @@ class RecurrentOp : public RecurrentBase { ...@@ -234,7 +234,7 @@ class RecurrentOp : public RecurrentBase {
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(dev_ctx); framework::Executor executor(dev_ctx);
auto *block = Attr<framework::BlockDescBind *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
for (size_t i = 0; i < seq_len; ++i) { for (size_t i = 0; i < seq_len; ++i) {
...@@ -317,7 +317,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -317,7 +317,7 @@ class RecurrentGradOp : public RecurrentBase {
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(dev_ctx); framework::Executor executor(dev_ctx);
auto *block = Attr<framework::BlockDescBind *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
for (size_t step_id = 0; step_id < seq_len; ++step_id) { for (size_t step_id = 0; step_id < seq_len; ++step_id) {
...@@ -522,8 +522,7 @@ The ex-state means the state value in the ex-timestep or the previous time step ...@@ -522,8 +522,7 @@ The ex-state means the state value in the ex-timestep or the previous time step
string::Sprintf( string::Sprintf(
"The state variable names. [%s, %s, %s] must be the same order", "The state variable names. [%s, %s, %s] must be the same order",
kExStates, kStates, kInitStateGrads)); kExStates, kStates, kInitStateGrads));
AddAttr<framework::BlockDescBind *>(kStepBlock, AddAttr<framework::BlockDesc *>(kStepBlock, "The step block inside RNN");
"The step block inside RNN");
AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not. AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not.
By default reverse=False By default reverse=False
...@@ -565,13 +564,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -565,13 +564,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
virtual std::unique_ptr<framework::OpDescBind> Apply() const { virtual std::unique_ptr<framework::OpDesc> Apply() const {
auto *grad = new framework::OpDescBind(); auto *grad = new framework::OpDesc();
grad->SetType("recurrent_grad"); grad->SetType("recurrent_grad");
for (auto &input_param : this->InputNames()) { for (auto &input_param : this->InputNames()) {
grad->SetInput(input_param, this->Input(input_param)); grad->SetInput(input_param, this->Input(input_param));
grad->SetOutput(framework::GradVarName(input_param), grad->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param)); this->InputGrad(input_param, false));
} }
for (auto &output_param : this->OutputNames()) { for (auto &output_param : this->OutputNames()) {
...@@ -588,7 +587,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -588,7 +587,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
grad->SetAttrMap(this->Attrs()); grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kStepBlock, *grad_block_[0]); grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
return std::unique_ptr<framework::OpDescBind>(grad); return std::unique_ptr<framework::OpDesc>(grad);
} }
}; };
......
...@@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { ...@@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
public: public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("scale"); grad_op->SetType("scale");
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttr("scale", GetAttr("scale")); grad_op->SetAttr("scale", GetAttr("scale"));
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -124,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel { ...@@ -124,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
sequence_concat_grad, ops::SequenceConcatGradOp); ops::SequenceConcatOpMaker, sequence_concat_grad,
ops::SequenceConcatGradOp, false);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
sequence_concat, sequence_concat,
ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>); ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
......
...@@ -136,14 +136,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { ...@@ -136,14 +136,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op = new framework::OpDescBind(); auto *op = new framework::OpDesc();
op->SetType("shrink_rnn_memory_grad"); op->SetType("shrink_rnn_memory_grad");
op->SetInput("X", Input("X")); op->SetInput("X", Input("X"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs()); op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
}; };
......
...@@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker { ...@@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker {
public: public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("scale"); grad_op->SetType("scale");
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttr("scale", 0.0f); grad_op->SetAttr("scale", 0.0f);
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -173,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { ...@@ -173,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto* grad_op = new framework::OpDescBind(); auto* grad_op = new framework::OpDesc();
grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Label", Input("Label"));
grad_op->SetInput("Softmax", Output("Softmax")); grad_op->SetInput("Softmax", Output("Softmax"));
...@@ -183,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { ...@@ -183,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -163,8 +163,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { ...@@ -163,8 +163,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("merge_lod_tensor"); grad_op->SetType("merge_lod_tensor");
grad_op->SetInput("InTrue", OutputGrad("OutTrue")); grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
grad_op->SetInput("InFalse", OutputGrad("OutFalse")); grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
...@@ -172,7 +172,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { ...@@ -172,7 +172,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
grad_op->SetInput("X", Input("X")); grad_op->SetInput("X", Input("X"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -108,13 +108,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker { ...@@ -108,13 +108,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto op = new framework::OpDescBind(); auto op = new framework::OpDesc();
op->SetType("concat"); op->SetType("concat");
op->SetInput("X", OutputGrad("Out")); op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X")); op->SetOutput("Out", InputGrad("X"));
op->SetAttrMap(Attrs()); op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(op); return std::unique_ptr<framework::OpDesc>(op);
} }
}; };
......
...@@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) {
platform::GPUPlace gpu0(0); platform::GPUPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src))); int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
framework::DDim src_stride({5, 1}); framework::DDim src_stride({5, 1});
...@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) { ...@@ -96,7 +98,6 @@ TEST(StridedMemcpy, GPUCrop) {
framework::DDim dst_dim({2, 2}); framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({2, 1}); framework::DDim dst_stride({2, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
gpu_dst); gpu_dst);
...@@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) {
platform::GPUPlace gpu0(0); platform::GPUPlace gpu0(0);
platform::CPUPlace cpu; platform::CPUPlace cpu;
platform::CUDADeviceContext ctx(gpu0);
int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src))); int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
int dst[8]; int dst[8];
int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst))); int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
...@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) { ...@@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) {
framework::DDim src_stride({2, 1}); framework::DDim src_stride({2, 1});
framework::DDim dst_dim({2, 2}); framework::DDim dst_dim({2, 2});
framework::DDim dst_stride({4, 1}); framework::DDim dst_stride({4, 1});
platform::CUDADeviceContext ctx(gpu0);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
......
...@@ -115,8 +115,8 @@ the LoD information with the first input. ...@@ -115,8 +115,8 @@ the LoD information with the first input.
class SumOpVarTypeInference : public framework::VarTypeInference { class SumOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind& op_desc, void operator()(const framework::OpDesc& op_desc,
framework::BlockDescBind* block) const override { framework::BlockDesc* block) const override {
auto& inputs = op_desc.Input("X"); auto& inputs = op_desc.Input("X");
auto var_type = framework::proto::VarDesc::SELECTED_ROWS; auto var_type = framework::proto::VarDesc::SELECTED_ROWS;
...@@ -169,20 +169,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase { ...@@ -169,20 +169,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase {
public: public:
using framework::GradOpDescMakerBase::GradOpDescMakerBase; using framework::GradOpDescMakerBase::GradOpDescMakerBase;
std::vector<std::unique_ptr<framework::OpDescBind>> operator()() std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
const override { auto x_grads = InputGrad("X", false);
auto x_grads = InputGrad("X"); std::vector<std::unique_ptr<framework::OpDesc>> grad_ops;
std::vector<std::unique_ptr<framework::OpDescBind>> grad_ops;
grad_ops.reserve(x_grads.size()); grad_ops.reserve(x_grads.size());
auto og = OutputGrad("Out"); auto og = OutputGrad("Out");
std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops),
[&og](const std::string& x_grad) { [&og](const std::string& x_grad) {
auto* grad_op = new framework::OpDescBind(); auto* grad_op = new framework::OpDesc();
grad_op->SetType("scale"); grad_op->SetType("scale");
grad_op->SetInput("X", og); grad_op->SetInput("X", og);
grad_op->SetOutput("Out", {x_grad}); grad_op->SetOutput("Out", {x_grad});
grad_op->SetAttr("scale", 1.0f); grad_op->SetAttr("scale", 1.0f);
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
}); });
return grad_ops; return grad_ops;
} }
......
...@@ -96,8 +96,8 @@ class WriteToArrayInferShape : public framework::InferShapeBase { ...@@ -96,8 +96,8 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
class WriteToArrayInferVarType : public framework::VarTypeInference { class WriteToArrayInferVarType : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDescBind *block) const override { framework::BlockDesc *block) const override {
auto x_name = op_desc.Input("X")[0]; auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0]; auto out_name = op_desc.Output("Out")[0];
VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
...@@ -175,14 +175,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { ...@@ -175,14 +175,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("read_from_array"); grad_op->SetType("read_from_array");
grad_op->SetInput("I", Input("I")); grad_op->SetInput("I", Input("I"));
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
...@@ -191,14 +191,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { ...@@ -191,14 +191,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad_op = new framework::OpDescBind(); auto *grad_op = new framework::OpDesc();
grad_op->SetType("write_to_array"); grad_op->SetType("write_to_array");
grad_op->SetInput("I", Input("I")); grad_op->SetInput("I", Input("I"));
grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs()); grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op); return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
......
...@@ -46,7 +46,7 @@ class WhileOp : public framework::OperatorBase { ...@@ -46,7 +46,7 @@ class WhileOp : public framework::OperatorBase {
PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
framework::Executor executor(dev_ctx); framework::Executor executor(dev_ctx);
auto *block = Attr<framework::BlockDescBind *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
auto step_scopes = auto step_scopes =
...@@ -82,7 +82,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,7 +82,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
"(StepScopeVar) A vector of local scope, which size equals the " "(StepScopeVar) A vector of local scope, which size equals the "
"step number of While Op. The i'th scope storages temporary " "step number of While Op. The i'th scope storages temporary "
"variables generated in the i'th step."); "variables generated in the i'th step.");
AddAttr<framework::BlockDescBind *>(kStepBlock, AddAttr<framework::BlockDesc *>(kStepBlock,
"The step block inside WhileOp"); "The step block inside WhileOp");
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
...@@ -99,7 +99,7 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -99,7 +99,7 @@ class WhileGradOp : public framework::OperatorBase {
void Run(const framework::Scope &scope, void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override { const platform::DeviceContext &dev_ctx) const override {
framework::Executor executor(dev_ctx); framework::Executor executor(dev_ctx);
auto *block = Attr<framework::BlockDescBind *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
auto *step_scopes = auto *step_scopes =
...@@ -209,8 +209,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -209,8 +209,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected: protected:
std::unique_ptr<framework::OpDescBind> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad = new framework::OpDescBind(); auto *grad = new framework::OpDesc();
grad->SetType("while_grad"); grad->SetType("while_grad");
grad->SetInput(kParameters, Input(kParameters)); grad->SetInput(kParameters, Input(kParameters));
...@@ -279,14 +279,14 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -279,14 +279,14 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
// while operator could be renamed. // while operator could be renamed.
grad->SetAttr("original_output_grad", extra_inputs_list); grad->SetAttr("original_output_grad", extra_inputs_list);
return std::unique_ptr<framework::OpDescBind>(grad); return std::unique_ptr<framework::OpDesc>(grad);
} }
}; };
class WhileGradOpVarTypeInference : public framework::VarTypeInference { class WhileGradOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDescBind &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDescBind *block) const override { framework::BlockDesc *block) const override {
auto p_names = op_desc.Input(kParameters); auto p_names = op_desc.Input(kParameters);
auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); auto pg_names = op_desc.Output(framework::GradVarName(kParameters));
......
...@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count, ...@@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
} }
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
// note: cudaMemcpy may actually be asynchronous with respect to the caller,
// block on stream 0 to make sure the copy has completed
PADDLE_ENFORCE(
cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
}
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) { size_t count, cudaStream_t stream) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
......
...@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize(); ...@@ -52,10 +52,6 @@ size_t GpuMaxChunkSize();
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream); enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy memory from one device to another device. //! Copy memory from one device to another device.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream); size_t count, cudaStream_t stream);
......
...@@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) { ...@@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) {
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4)); float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans; Transform<paddle::platform::CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10)); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
Free(gpu0, gpu_buf); Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5); ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
...@@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) { ...@@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) {
GPUPlace gpu0(0); GPUPlace gpu0(0);
CUDADeviceContext ctx(gpu0); CUDADeviceContext ctx(gpu0);
int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf))); int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
Transform<paddle::platform::CUDADeviceContext> trans; Transform<paddle::platform::CUDADeviceContext> trans;
trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>()); trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx.Wait(); ctx.Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
Free(gpu0, gpu_buf); Free(gpu0, gpu_buf);
for (int i = 0; i < 4; ++i) { for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]); ASSERT_EQ((i + 1) * (i + 1), buf[i]);
......
...@@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) { ...@@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) {
// Bind Methods // Bind Methods
void BindProgramDesc(py::module &m) { void BindProgramDesc(py::module &m) {
py::class_<ProgramDescBind>(m, "ProgramDesc", "") py::class_<ProgramDesc>(m, "ProgramDesc", "")
.def(py::init<>()) .def(py::init<>())
.def("__init__", .def("__init__",
[](ProgramDescBind &self, const ProgramDescBind &other) { [](ProgramDesc &self, const ProgramDesc &other) {
new (&self) ProgramDescBind(other); new (&self) ProgramDesc(other);
}) })
.def("__init__", .def("__init__",
[](ProgramDescBind &self, const py::bytes &binary_str) { [](ProgramDesc &self, const py::bytes &binary_str) {
std::string str(binary_str); std::string str(binary_str);
new (&self) ProgramDescBind(str); new (&self) ProgramDesc(str);
}) })
.def("append_block", &ProgramDescBind::AppendBlock, .def("append_block", &ProgramDesc::AppendBlock,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("append_backward", .def("append_backward",
[](ProgramDescBind &program_desc, const VarDescBind &target, [](ProgramDesc &program_desc, const VarDesc &target,
const std::unordered_set<std::string> &no_grad_vars) { const std::unordered_set<std::string> &no_grad_vars) {
ParamGradInfoMap param_grad_map = ParamGradInfoMap param_grad_map =
AppendBackward(program_desc, target, no_grad_vars); AppendBackward(program_desc, target, no_grad_vars);
...@@ -138,12 +138,12 @@ void BindProgramDesc(py::module &m) { ...@@ -138,12 +138,12 @@ void BindProgramDesc(py::module &m) {
} }
return retv; return retv;
}) })
.def("block", &ProgramDescBind::MutableBlock, .def("block", &ProgramDesc::MutableBlock,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("num_blocks", &ProgramDescBind::Size) .def("num_blocks", &ProgramDesc::Size)
.def("serialize_to_string", SerializeMessage<ProgramDescBind>) .def("serialize_to_string", SerializeMessage<ProgramDesc>)
.def("parse_from_string", .def("parse_from_string",
[](ProgramDescBind &program_desc, const std::string &data) { [](ProgramDesc &program_desc, const std::string &data) {
proto::ProgramDesc *desc = program_desc.Proto(); proto::ProgramDesc *desc = program_desc.Proto();
PADDLE_ENFORCE(desc->ParseFromString(data), PADDLE_ENFORCE(desc->ParseFromString(data),
"Fail to parse ProgramDesc from string. This could " "Fail to parse ProgramDesc from string. This could "
...@@ -152,35 +152,34 @@ void BindProgramDesc(py::module &m) { ...@@ -152,35 +152,34 @@ void BindProgramDesc(py::module &m) {
} }
void BindBlockDesc(py::module &m) { void BindBlockDesc(py::module &m) {
py::class_<BlockDescBind>(m, "BlockDesc", "") py::class_<BlockDesc>(m, "BlockDesc", "")
.def_property_readonly("id", &BlockDescBind::ID) .def_property_readonly("id", &BlockDesc::ID)
.def_property_readonly("parent", &BlockDescBind::Parent) .def_property_readonly("parent", &BlockDesc::Parent)
.def("append_op", &BlockDescBind::AppendOp, .def("append_op", &BlockDesc::AppendOp,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("prepend_op", &BlockDescBind::PrependOp, .def("prepend_op", &BlockDesc::PrependOp,
py::return_value_policy::reference) py::return_value_policy::reference)
.def("var", .def("var",
[](BlockDescBind &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.Var(name); return self.Var(name);
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("has_var", .def("has_var",
[](BlockDescBind &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.HasVar(name); return self.HasVar(name);
}) })
.def("find_var", .def("find_var",
[](BlockDescBind &self, py::bytes byte_name) { [](BlockDesc &self, py::bytes byte_name) {
std::string name = byte_name; std::string name = byte_name;
return self.FindVar(name); return self.FindVar(name);
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("all_vars", &BlockDescBind::AllVars, .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
py::return_value_policy::reference) .def("op_size", &BlockDesc::OpSize)
.def("op_size", &BlockDescBind::OpSize) .def("op", &BlockDesc::Op, py::return_value_policy::reference)
.def("op", &BlockDescBind::Op, py::return_value_policy::reference) .def("serialize_to_string", SerializeMessage<BlockDesc>);
.def("serialize_to_string", SerializeMessage<BlockDescBind>);
} }
void BindVarDsec(py::module &m) { void BindVarDsec(py::module &m) {
...@@ -193,25 +192,25 @@ void BindVarDsec(py::module &m) { ...@@ -193,25 +192,25 @@ void BindVarDsec(py::module &m) {
.value("FP32", proto::DataType::FP32) .value("FP32", proto::DataType::FP32)
.value("FP64", proto::DataType::FP64); .value("FP64", proto::DataType::FP64);
py::class_<VarDescBind> var_desc(m, "VarDesc", ""); py::class_<VarDesc> var_desc(m, "VarDesc", "");
var_desc var_desc
.def("name", .def("name",
[](const VarDescBind &self) { [](const VarDesc &self) {
py::bytes name = self.Name(); py::bytes name = self.Name();
return name; return name;
}, },
py::return_value_policy::reference) py::return_value_policy::reference)
.def("set_shape", &VarDescBind::SetShape) .def("set_shape", &VarDesc::SetShape)
.def("set_dtype", &VarDescBind::SetDataType) .def("set_dtype", &VarDesc::SetDataType)
.def("shape", &VarDescBind::Shape, py::return_value_policy::reference) .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
.def("dtype", &VarDescBind::GetDataType) .def("dtype", &VarDesc::GetDataType)
.def("lod_level", &VarDescBind::GetLodLevel) .def("lod_level", &VarDesc::GetLodLevel)
.def("set_lod_level", &VarDescBind::SetLoDLevel) .def("set_lod_level", &VarDesc::SetLoDLevel)
.def("type", &VarDescBind::GetType) .def("type", &VarDesc::GetType)
.def("set_type", &VarDescBind::SetType) .def("set_type", &VarDesc::SetType)
.def("serialize_to_string", SerializeMessage<VarDescBind>) .def("serialize_to_string", SerializeMessage<VarDesc>)
.def("persistable", &VarDescBind::Persistable) .def("persistable", &VarDesc::Persistable)
.def("set_persistable", &VarDescBind::SetPersistable); .def("set_persistable", &VarDesc::SetPersistable);
py::enum_<proto::VarDesc::VarType>(var_desc, "VarType", "") py::enum_<proto::VarDesc::VarType>(var_desc, "VarType", "")
.value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR) .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR)
...@@ -235,26 +234,26 @@ void BindOpDesc(py::module &m) { ...@@ -235,26 +234,26 @@ void BindOpDesc(py::module &m) {
.value("BOOLS", proto::AttrType::BOOLEANS) .value("BOOLS", proto::AttrType::BOOLEANS)
.value("BLOCK", proto::AttrType::BLOCK); .value("BLOCK", proto::AttrType::BLOCK);
py::class_<OpDescBind> op_desc(m, "OpDesc", ""); py::class_<OpDesc> op_desc(m, "OpDesc", "");
op_desc.def("type", &OpDescBind::Type) op_desc.def("type", &OpDesc::Type)
.def("set_type", &OpDescBind::SetType) .def("set_type", &OpDesc::SetType)
.def("input", &OpDescBind::Input) .def("input", &OpDesc::Input)
.def("input_names", &OpDescBind::InputNames) .def("input_names", &OpDesc::InputNames)
.def("set_input", &OpDescBind::SetInput) .def("set_input", &OpDesc::SetInput)
.def("output", &OpDescBind::Output) .def("output", &OpDesc::Output)
.def("output_names", &OpDescBind::OutputNames) .def("output_names", &OpDesc::OutputNames)
.def("set_output", &OpDescBind::SetOutput) .def("set_output", &OpDesc::SetOutput)
.def("has_attr", &OpDescBind::HasAttr) .def("has_attr", &OpDesc::HasAttr)
.def("attr_type", &OpDescBind::GetAttrType) .def("attr_type", &OpDesc::GetAttrType)
.def("attr_names", &OpDescBind::AttrNames) .def("attr_names", &OpDesc::AttrNames)
.def("set_attr", &OpDescBind::SetAttr) .def("set_attr", &OpDesc::SetAttr)
.def("attr", &OpDescBind::GetAttr) .def("attr", &OpDesc::GetAttr)
.def("set_block_attr", &OpDescBind::SetBlockAttr) .def("set_block_attr", &OpDesc::SetBlockAttr)
.def("block_attr", &OpDescBind::GetBlockAttr) .def("block_attr", &OpDesc::GetBlockAttr)
.def("check_attrs", &OpDescBind::CheckAttrs) .def("check_attrs", &OpDesc::CheckAttrs)
.def("infer_shape", &OpDescBind::InferShape) .def("infer_shape", &OpDesc::InferShape)
.def("infer_var_type", &OpDescBind::InferVarType) .def("infer_var_type", &OpDesc::InferVarType)
.def("serialize_to_string", SerializeMessage<OpDescBind>); .def("serialize_to_string", SerializeMessage<OpDesc>);
} }
} // namespace pybind } // namespace pybind
......
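The renamed classes keep their Python-facing names above, so desc objects can still be assembled directly from Python. A minimal sketch of that usage, assuming the paddle.v2.fluid.core module path and a hypothetical "mul" op wiring (neither is introduced by this patch):

    import paddle.v2.fluid.core as core   # assumed module path

    prog = core.ProgramDesc()             # binds framework::ProgramDesc
    block = prog.block(0)                 # root BlockDesc
    x = block.var("x")                    # creates a VarDesc named "x"
    x.set_type(core.VarDesc.VarType.LOD_TENSOR)
    x.set_shape([1, 784])
    block.var("w")
    block.var("out")

    op = block.append_op()                # new OpDesc appended to the block
    op.set_type("mul")
    op.set_input("X", ["x"])
    op.set_input("Y", ["w"])
    op.set_output("Out", ["out"])
    print(prog.num_blocks(), block.op_size())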
...@@ -266,36 +266,36 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -266,36 +266,36 @@ All parameter, weight, gradient are variables in Paddle.
return ret_values; return ret_values;
}); });
m.def("get_grad_op_descs", m.def("get_grad_op_descs",
[](const OpDescBind &op_desc, [](const OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set, const std::unordered_set<std::string> &no_grad_set,
std::unordered_map<std::string, std::string> &grad_to_var, std::unordered_map<std::string, std::string> &grad_to_var,
const std::vector<BlockDescBind *> &grad_sub_block) { const std::vector<BlockDesc *> &grad_sub_block) {
std::vector<std::unique_ptr<OpDescBind>> grad_op_descs = std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance() framework::OpInfoMap::Instance()
.Get(op_desc.Type()) .Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, &grad_to_var, .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
grad_sub_block); grad_sub_block);
std::vector<OpDescBind *> grad_op_desc_ptrs(grad_op_descs.size()); std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
std::transform( std::transform(
grad_op_descs.begin(), grad_op_descs.end(), grad_op_descs.begin(), grad_op_descs.end(),
grad_op_desc_ptrs.begin(), grad_op_desc_ptrs.begin(),
[](std::unique_ptr<OpDescBind> &p) { return p.release(); }); [](std::unique_ptr<OpDesc> &p) { return p.release(); });
return grad_op_desc_ptrs; return grad_op_desc_ptrs;
}); });
m.def("prune", [](const ProgramDescBind &origin, m.def("prune", [](const ProgramDesc &origin,
const std::vector<std::array<size_t, 2>> &targets) { const std::vector<std::array<size_t, 2>> &targets) {
ProgramDescBind prog_with_targets(origin); ProgramDesc prog_with_targets(origin);
for (const auto &t : targets) { for (const auto &t : targets) {
prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget(); prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
} }
proto::ProgramDesc pruned_desc; proto::ProgramDesc pruned_desc;
Prune(*prog_with_targets.Proto(), &pruned_desc); Prune(*prog_with_targets.Proto(), &pruned_desc);
return new ProgramDescBind(pruned_desc); return new ProgramDesc(pruned_desc);
}); });
m.def("inference_optimize", [](ProgramDescBind &origin) { m.def("inference_optimize", [](ProgramDesc &origin) {
proto::ProgramDesc pruned_desc; proto::ProgramDesc pruned_desc;
InferenceOptimize(*(origin.Proto()), &pruned_desc); InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDescBind(pruned_desc); return new ProgramDesc(pruned_desc);
}); });
m.def_submodule( m.def_submodule(
"var_names", "var_names",
......
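A hedged sketch of driving the prune and inference_optimize bindings above from Python; the module path is assumed, and prune is only described in a comment because its target format ([block_idx, op_idx] pairs) is an assumption here, not something this patch documents:

    import paddle.v2.fluid.core as core   # assumed module path

    prog = core.ProgramDesc()             # fresh program with an empty root block
    # inference_optimize returns a new ProgramDesc with training-only parts stripped
    infer_prog = core.inference_optimize(prog)
    # core.prune(prog, [[block_idx, op_idx], ...]) would likewise return a ProgramDesc
    # keeping only the ops required by the marked target ops (target format assumed)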
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/framework/executor.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/memory/memcpy.h" #include "paddle/memory/memcpy.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
...@@ -61,11 +62,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> { ...@@ -61,11 +62,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>()); auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>( auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
tensor.dims(), platform::CPUPlace())); tensor.dims(), platform::CPUPlace()));
// TODO(qijun): Here we use default CUDA stream to set GPU Tensor to
// a Python numpy array. It's better to manage CDUA stream unifiedly. framework::DeviceContextPool &pool =
paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, framework::DeviceContextPool::Get();
sizeof(CUR_TYPE) * tensor.numel(), auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
cudaMemcpyDeviceToHost); pool.Borrow(tensor.place()));
paddle::platform::GpuMemcpyAsync(
dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
cudaMemcpyDeviceToHost, dev_ctx->stream());
#else #else
PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
#endif #endif
...@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray( ...@@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray(
self.Resize(framework::make_ddim(dims)); self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place); auto *dst = self.mutable_data<T>(place);
// TODO(qijun): Here we use default CUDA stream to set a Python numpy
// array to a GPU Tensor. It's better to manage CDUA stream unifiedly. framework::DeviceContextPool &pool = framework::DeviceContextPool::Get();
paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), auto dev_ctx =
cudaMemcpyHostToDevice); static_cast<const platform::CUDADeviceContext *>(pool.Borrow(place));
paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
} }
#endif #endif
......
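The Python-visible round trip served by this file is unchanged; only the copy now rides on the device context's stream instead of the removed synchronous helper. A sketch of that round trip, with the module path, GPUPlace, and the set/buffer bindings assumed from the surrounding code rather than introduced here:

    import numpy as np
    import paddle.v2.fluid.core as core    # assumed module path

    place = core.GPUPlace(0)
    t = core.LoDTensor()
    t.set(np.random.rand(2, 3).astype('float32'), place)  # PyCUDATensorSetFromArray path
    host_copy = np.array(t)                                # CastToPyBufferImpl path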
...@@ -441,9 +441,25 @@ def topk(input, k): ...@@ -441,9 +441,25 @@ def topk(input, k):
def lod_tensor_to_array(x, table): def lod_tensor_to_array(x, table):
""" """This function performs the operation that converts an LOD_Tensor to
This function creates an operator to convert an LOD_Tensor to
an array. an array.
Args:
x (Variable|list): The tensor that needs to be converted to an array.
table (ParamAttr|list): The variable that stores the level of lod
which is ordered by sequence length in
descending order.
Returns:
Variable: The variable of type array that has been converted from a
tensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10])
table = fluid.layers.lod_rank_table(x, level=0)
array = fluid.layers.lod_tensor_to_array(x, table)
""" """
helper = LayerHelper("lod_tensor_to_array", **locals()) helper = LayerHelper("lod_tensor_to_array", **locals())
array = helper.create_variable( array = helper.create_variable(
...@@ -459,9 +475,26 @@ def lod_tensor_to_array(x, table): ...@@ -459,9 +475,26 @@ def lod_tensor_to_array(x, table):
def array_to_lod_tensor(x, table): def array_to_lod_tensor(x, table):
""" """This function performs the operations that converts an array to
This function creates an operator to convert an array to a an LOD_Tensor.
LOD_Tensor.
Args:
x (Variable|list): The array that needs to be converted to a tensor.
table (ParamAttr|list): The variable that stores the level of lod
which is ordered by sequence length in
descending order.
Returns:
Variable: The variable of type tensor that has been converted
from an array.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[10])
table = fluid.layers.lod_rank_table(x, level=0)
array = fluid.layers.lod_tensor_to_array(x, table)
lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
""" """
helper = LayerHelper("array_to_lod_tensor", **locals()) helper = LayerHelper("array_to_lod_tensor", **locals())
tmp = helper.create_tmp_variable(dtype=x.dtype) tmp = helper.create_tmp_variable(dtype=x.dtype)
...@@ -474,10 +507,24 @@ def array_to_lod_tensor(x, table): ...@@ -474,10 +507,24 @@ def array_to_lod_tensor(x, table):
def increment(x, value=1.0, in_place=True): def increment(x, value=1.0, in_place=True):
""" """This function performs an operation that increments each value in the
This function creates an operator to increment each value in the input input :math:`x` by an amount: :math:`value` as mentioned in the input
`x` by an amount: `value` as mentioned in the input parameter. This parameter. This operation is performed in-place by default.
operation is performed in-place by default.
Args:
x (Variable|list): The tensor that has the input values.
value (float): The amount by which the values should be incremented.
in_place (bool): If the increment should be performed in-place.
Returns:
Variable: The tensor variable storing the transformation of
element-wise increment of each value in the input.
Examples:
.. code-block:: python
data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
data = fluid.layers.increment(x=data, value=3.0, in_place=True)
""" """
helper = LayerHelper("increment", **locals()) helper = LayerHelper("increment", **locals())
if not in_place: if not in_place:
...@@ -493,9 +540,24 @@ def increment(x, value=1.0, in_place=True): ...@@ -493,9 +540,24 @@ def increment(x, value=1.0, in_place=True):
def array_write(x, i, array=None): def array_write(x, i, array=None):
""" """This function performs the operation to write the data out as an
This function creates an operator to write the data out as a
LOD_TENSOR_ARRAY. LOD_TENSOR_ARRAY.
Args:
x (Variable|list): The input tensor from which the data will be read.
i (Variable|list): The subscript index in the tensor array, that points to the
position where the data will be written.
array (Variable|list): The data can be read into this variable if
this is assigned.
Returns:
Variable: The tensor type variable that has the data written to it.
Examples:
.. code-block:: python
tmp = fluid.layers.zeros(shape=[10], dtype='int32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
arr = layers.array_write(tmp, i=i)
""" """
helper = LayerHelper('array_write', **locals()) helper = LayerHelper('array_write', **locals())
if array is None: if array is None:
...@@ -512,6 +574,21 @@ def array_write(x, i, array=None): ...@@ -512,6 +574,21 @@ def array_write(x, i, array=None):
def create_array(dtype): def create_array(dtype):
"""This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
LayerHelper.
Args:
dtype (int|float): The data type of the elements in the array.
Returns:
Variable: The LOD_TENSOR_ARRAY variable storing the elements of the given data type.
Examples:
.. code-block:: python
data = fluid.layers.create_array(dtype='float32')
"""
helper = LayerHelper("array", **locals()) helper = LayerHelper("array", **locals())
return helper.create_variable( return helper.create_variable(
name="{0}.out".format(helper.name), name="{0}.out".format(helper.name),
...@@ -550,9 +627,19 @@ def less_than(x, y, cond=None, **ignored): ...@@ -550,9 +627,19 @@ def less_than(x, y, cond=None, **ignored):
def array_read(array, i): def array_read(array, i):
""" """This function performs the operation to read the data in as an
This function creates an operator to read the data in as a
LOD_TENSOR_ARRAY. LOD_TENSOR_ARRAY.
Args:
array (Variable|list): The input LOD_TENSOR_ARRAY from which the data will be read.
i (Variable|list): The subscript index in the tensor array, that points to the
position from which the data will be read.
Returns:
Variable: The tensor type variable that holds the data read from the array.
Examples:
.. code-block:: python
tmp = fluid.layers.zeros(shape=[10], dtype='int32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
arr = fluid.layers.array_write(tmp, i=i)
item = fluid.layers.array_read(arr, i=i)
""" """
helper = LayerHelper('array_read', **locals()) helper = LayerHelper('array_read', **locals())
if not isinstance( if not isinstance(
...@@ -586,9 +673,23 @@ def shrink_memory(x, i, table): ...@@ -586,9 +673,23 @@ def shrink_memory(x, i, table):
def array_length(array): def array_length(array):
""" """This function performs the operation to find the length of the input
This function creates an operator to find the length of the
LOD_TENSOR_ARRAY. LOD_TENSOR_ARRAY.
Args:
array (LOD_TENSOR_ARRAY): The input array that will be used
to compute the length.
Returns:
Variable: The length of the input LoDTensorArray.
Examples:
.. code-block:: python
tmp = fluid.layers.zeros(shape=[10], dtype='int32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
arr = fluid.layers.array_write(tmp, i=i)
arr_len = fluid.layers.array_length(arr)
""" """
helper = LayerHelper('array_length', **locals()) helper = LayerHelper('array_length', **locals())
tmp = helper.create_tmp_variable(dtype='int64') tmp = helper.create_tmp_variable(dtype='int64')
......
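Assembled from the docstrings above, a minimal sketch of how the tensor-array helpers compose; the paddle.v2.fluid import path is assumed:

    import paddle.v2.fluid as fluid       # assumed import path

    tmp = fluid.layers.zeros(shape=[10], dtype='int32')
    i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
    arr = fluid.layers.array_write(tmp, i=i)        # arr[i] = tmp
    item = fluid.layers.array_read(array=arr, i=i)  # read arr[i] back
    length = fluid.layers.array_length(arr)         # number of elements written so far
    i = fluid.layers.increment(x=i)                 # advance the index for the next write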
...@@ -12,20 +12,9 @@ def data(name, ...@@ -12,20 +12,9 @@ def data(name,
type=core.VarDesc.VarType.LOD_TENSOR, type=core.VarDesc.VarType.LOD_TENSOR,
stop_gradient=True): stop_gradient=True):
""" """
Data Layer. **Data Layer**
Args: This function takes in the input and based on whether data has
name: The name/alias of the function
shape: Tuple declaring the shape.
append_batch_size: Whether or not to append the data as a batch.
dtype: The type of data : float32, float_16, int etc
type: The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
main_program: Name of the main program that calls this
startup_program: Name of the startup program
stop_gradient: A boolean that mentions whether gradient should flow.
This function takes in input and based on whether data has
to be returned back as a minibatch, it creates the global variable using to be returned back as a minibatch, it creates the global variable using
the helper functions. The global variables can be accessed by all the the helper functions. The global variables can be accessed by all the
following operations and layers in the graph. following operations and layers in the graph.
...@@ -33,6 +22,24 @@ def data(name, ...@@ -33,6 +22,24 @@ def data(name,
All the input variables of this function are passed in as local variables All the input variables of this function are passed in as local variables
to the LayerHelper constructor. to the LayerHelper constructor.
Args:
name(str): The name/alias of the function
shape(list): Tuple declaring the shape.
append_batch_size(bool): Whether or not to append the data as a batch.
dtype(int|float): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence.
main_program(Program): Name of the main program that calls this
startup_program(Program): Name of the startup program
stop_gradient(bool): A boolean that mentions whether gradient should flow.
Returns:
Variable: The global variable that gives access to the data.
Examples:
.. code-block:: python
data = fluid.layers.data(name='x', shape=[784], dtype='float32')
""" """
helper = LayerHelper('data', **locals()) helper = LayerHelper('data', **locals())
shape = list(shape) shape = list(shape)
......
...@@ -27,48 +27,81 @@ def fc(input, ...@@ -27,48 +27,81 @@ def fc(input,
""" """
**Fully Connected Layer** **Fully Connected Layer**
This layer accepts multiple inputs and applies a linear transformation to each input. The fully connected layer can take multiple tensors as its inputs. It
If activation type is provided, the corresponding activation function is applied to the creates a variable (one for each input tensor) called weights for each input
output of the linear transformation. For each input :math:`X`, the equation is: tensor, which represents a fully connected weight matrix from each input
unit to each output unit. The fully connected layer multiplies each input
tensor with its coresponding weight to produce an output Tensor. If
multiple input tensors are given, the results of multiple multiplications
will be sumed up. If bias_attr is not None, a biases variable will be
created and added to the output. Finally, if activation is not None,
it will be applied to the output as well.
This process can be formulated as follows:
.. math:: .. math::
Out = Act(WX + b) Out = Act\left({\sum_{i=0}^{N-1}W_iX_i + b}\right)
In the above equation: In the above equation:
* :math:`X`: Input value, a tensor with rank at least 2. * :math:`N`: Number of the input.
* :math:`W`: Weight, a 2-D tensor with shape [M, N]. * :math:`X_i`: The input tensor.
* :math:`b`: Bias, a 2-D tensor with shape [M, 1]. * :math:`W`: The weights created by this layer.
* :math:`Act`: Activation function. * :math:`b`: The bias parameter created by this layer (if needed).
* :math:`Out`: Output value, same shape with :math:`X`. * :math:`Act`: The activation funtion.
* :math:`Out`: The output tensor.
All the input variables are passed in as local variables to the LayerHelper
constructor.
Args: Args:
input(Variable|list): Input tensors. Each tensor has a rank of atleast 2 input(Variable|list): The input tensor(s) to the fully connected layer.
size(int): Output size size(int): The number of output units in the fully connected layer.
num_flatten_dims(int): Number of columns in input num_flatten_dims(int): The fc layer can accept an input tensor with more
param_attr(ParamAttr|list): The parameters/weights to the FC Layer than two dimensions. If this happens, the
bias_attr(ParamAttr|list): Bias parameter for the FC layer multidimensional tensor will first be flattened
act(str): Activation type into a 2-dimensional matrix. The parameter
name(str): Name/alias of the function `num_flatten_dims` determines how the input tensor
is flattened: the first `num_flatten_dims`
dimensions will be flatten to form the first
dimension of the final matrix (height of the
matrix), and the rest `rank(X) - num_flatten_dims`
dimensions are flattened to form the second
dimension of the final matrix (width of the matrix).
For example, suppose `X` is a 5-dimensional tensor
with a shape [2, 3, 4, 5, 6], and
`num_flatten_dims` = 3. Then, the flattened matrix
will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
By default, `num_flatten_dims` is set to 1.
param_attr(ParamAttr|list): The parameter attribute for learnable
parameters/weights of the fully connected
layer.
param_initializer(ParamAttr|list): The initializer used for the
weight/parameter. If set None,
XavierInitializer() will be used.
bias_attr(ParamAttr|list): The parameter attribute for the bias parameter
for this layer. If set None, no bias will be
added to the output units.
bias_initializer(ParamAttr|list): The initializer used for the bias.
If set None, then ConstantInitializer()
will be used.
act(str): Activation to be applied to the output of the fully connected
layer.
name(str): Name/alias of the fully connected layer.
Returns: Returns:
Variable: The tensor variable storing the transformation and \ Variable: The output tensor variable.
non-linearity activation result.
Raises: Raises:
ValueError: If rank of input tensor is less than 2. ValueError: If rank of the input tensor is less than 2.
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh") fc = fluid.layers.fc(input=data, size=1000, act="tanh")
""" """
helper = LayerHelper('fc', **locals())
helper = LayerHelper("fc", **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -88,8 +121,8 @@ def fc(input, ...@@ -88,8 +121,8 @@ def fc(input,
"Y": w, "Y": w,
}, },
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={'x_num_col_dims': num_flatten_dims, attrs={"x_num_col_dims": num_flatten_dims,
'y_num_col_dims': 1}) "y_num_col_dims": 1})
mul_results.append(tmp) mul_results.append(tmp)
# sum # sum
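The rewritten docstring stresses that `fc` sums the projections of multiple inputs before the bias and activation are applied; a short usage sketch of that multi-input form (assumed, not part of this patch):

    import paddle.v2.fluid as fluid       # assumed import path

    data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
    data_2 = fluid.layers.data(name="data_2", shape=[32, 64], dtype="float32")
    # one weight matrix per input; the two mul results are summed, then bias and tanh apply
    out = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")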
...@@ -117,7 +150,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): ...@@ -117,7 +150,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
Args: Args:
input(Variable): Input to the function input(Variable): Input to the function
size(int): Output size size(tuple|list|None): Shape of the look up table parameter
is_sparse(bool): Boolean flag that specifying whether the input is sparse is_sparse(bool): Boolean flag that specifying whether the input is sparse
param_attr(ParamAttr): Parameters for this layer param_attr(ParamAttr): Parameters for this layer
dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
...@@ -704,6 +737,7 @@ def conv2d_transpose(input, ...@@ -704,6 +737,7 @@ def conv2d_transpose(input,
filter_size=None, filter_size=None,
padding=None, padding=None,
stride=None, stride=None,
dilation=None,
param_attr=None): param_attr=None):
""" """
The transpose of conv2d layer. The transpose of conv2d layer.
...@@ -727,6 +761,9 @@ def conv2d_transpose(input, ...@@ -727,6 +761,9 @@ def conv2d_transpose(input,
stride(int|tuple): The stride size. If stride is a tuple, it must stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. stride_H = stride_W = stride.
dilation(int|tuple): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation.
param_attr: Parameter Attribute. param_attr: Parameter Attribute.
main_program(Program): the main program main_program(Program): the main program
startup_program(Program): the startup program startup_program(Program): the startup program
...@@ -747,10 +784,15 @@ def conv2d_transpose(input, ...@@ -747,10 +784,15 @@ def conv2d_transpose(input,
op_attr['paddings'] = padding op_attr['paddings'] = padding
if isinstance(stride, int): if isinstance(stride, int):
op_attr['strides'] = stride op_attr['strides'] = [stride, stride]
elif stride is not None: elif stride is not None:
op_attr['strides'] = stride op_attr['strides'] = stride
if isinstance(dilation, int):
op_attr['dilations'] = [dilation, dilation]
elif dilation is not None:
op_attr['dilations'] = dilation
if filter_size is None: if filter_size is None:
if output_size is None: if output_size is None:
raise ValueError("output_size must be set when filter_size is None") raise ValueError("output_size must be set when filter_size is None")
...@@ -759,14 +801,17 @@ def conv2d_transpose(input, ...@@ -759,14 +801,17 @@ def conv2d_transpose(input,
padding = op_attr.get('paddings', [0, 0]) padding = op_attr.get('paddings', [0, 0])
stride = op_attr.get('strides', [1, 1]) stride = op_attr.get('strides', [1, 1])
dilation = op_attr.get('dilations', [1, 1])
h_in = input.shape[2] h_in = input.shape[2]
w_in = input.shape[3] w_in = input.shape[3]
filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] - 1) / dilation[0] + 1
filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] - 1) / dilation[1] + 1
filter_size = [filter_size_h, filter_size_w] filter_size = [filter_size_h, filter_size_w]
elif isinstance(filter_size, int): elif isinstance(filter_size, int):
filter_size = [filter_size, filter_size] filter_size = [filter_size, filter_size]
......
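The new filter-size expression is just the inverse of the output-size relation sketched above: it recovers the filter size the layer must use when only `output_size` is given. A quick numeric check with illustrative values:

    # filter = (output - (in - 1) * stride + 2 * padding - 1) / dilation + 1
    output_size, h_in, stride, padding, dilation = 7, 5, 1, 1, 2
    filter_size_h = (output_size - (h_in - 1) * stride + 2 * padding - 1) / dilation + 1
    assert filter_size_h == 3   # consistent with the forward relation above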
...@@ -27,10 +27,23 @@ def cast(x, dtype): ...@@ -27,10 +27,23 @@ def cast(x, dtype):
return out return out
def concat(input, axis): def concat(input, axis=0):
""" """
This function concats the input along the axis mentioned **Concat**
This function concatenates the input along the axis mentioned
and returns that as the output. and returns that as the output.
Args:
input(list): List of tensors to be concatenated
axis(int): Integer axis along which the tensors will be concatenated
Returns:
Variable: Output variable of the concatenation
Examples:
.. code-block:: python
out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
""" """
helper = LayerHelper('concat', **locals()) helper = LayerHelper('concat', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype()) out = helper.create_tmp_variable(dtype=helper.input_dtype())
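The `axis` semantics of `concat` match NumPy's `concatenate`; a NumPy reference sketch for intuition (this is not the Fluid API, just the equivalent array operation with illustrative shapes):

    import numpy as np
    a = np.ones((2, 3)); b = np.zeros((2, 3))
    np.concatenate([a, b], axis=0).shape   # (4, 3): joined along the first dimension
    np.concatenate([a, b], axis=1).shape   # (2, 6): joined along the second dimension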
...@@ -43,9 +56,28 @@ def concat(input, axis): ...@@ -43,9 +56,28 @@ def concat(input, axis):
def sums(input, out=None): def sums(input, out=None):
""" """This function performs the sum operation on the input and returns the
This function takes in the input and performs the sum operation on it result as the output.
and returns that as the output.
Args:
input (Variable|list): The input tensor that has the elements
that need to be summed up.
Returns:
Variable: The tensor type variable that has the sum of input
written to it.
Examples:
.. code-block:: python
tmp = fluid.layers.zeros(shape=[10], dtype='int32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
a0 = layers.array_read(array=tmp, i=i)
i = layers.increment(x=i)
a1 = layers.array_read(array=tmp, i=i)
mean_a0 = layers.mean(x=a0)
mean_a1 = layers.mean(x=a1)
a_sum = layers.sums(input=[mean_a0, mean_a1])
""" """
helper = LayerHelper('sum', **locals()) helper = LayerHelper('sum', **locals())
if out is None: if out is None:
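`sums` adds its inputs element-wise across the list rather than reducing within a tensor; a NumPy reference for what the docstring example computes (the values are illustrative placeholders for the fetched means):

    import numpy as np
    mean_a0, mean_a1 = np.array([0.5]), np.array([1.5])
    a_sum = mean_a0 + mean_a1   # sums(input=[mean_a0, mean_a1]) yields the same element-wise result, [2.0]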
...@@ -55,6 +87,24 @@ def sums(input, out=None): ...@@ -55,6 +87,24 @@ def sums(input, out=None):
def assign(input, output): def assign(input, output):
"""
**Assign**
This function copies the *input* Variable to the *output* Variable.
Args:
input(Variable): The source variable
output(Variable): The destination variable
Returns:
Variable: The destination variable that was supplied as the *output*.
Examples:
.. code-block:: python
out = fluid.layers.create_tensor(dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
fluid.layers.assign(hidden, out)
"""
helper = LayerHelper('assign', **locals()) helper = LayerHelper('assign', **locals())
helper.append_op( helper.append_op(
type='scale', type='scale',
......
...@@ -58,7 +58,9 @@ class ParamAttr(object): ...@@ -58,7 +58,9 @@ class ParamAttr(object):
def to_kwargs(self, with_initializer=False): def to_kwargs(self, with_initializer=False):
kwargs = { kwargs = {
'name': self.name, 'name': self.name,
'learning_rate': self.learning_rate, 'optimize_attr': {
'learning_rate': self.learning_rate
},
'regularizer': self.regularizer, 'regularizer': self.regularizer,
'trainable': self.trainable, 'trainable': self.trainable,
'clip_attr': self.clip 'clip_attr': self.clip
......
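After this change the learning rate is nested under `optimize_attr` in the dict that `to_kwargs` returns; roughly the following shape, with placeholder values (the parameter name and defaults below are illustrative, not taken from the patch):

    # Sketch of the returned kwargs after the change:
    kwargs = {
        'name': 'fc_0.w_0',
        'optimize_attr': {'learning_rate': 1.0},
        'regularizer': None,
        'trainable': True,
        'clip_attr': None,
    }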
...@@ -341,6 +341,10 @@ class TestBatchNormOp(OpTest): ...@@ -341,6 +341,10 @@ class TestBatchNormOp(OpTest):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
places.append(core.GPUPlace(0)) places.append(core.GPUPlace(0))
core.init_devices(["CPU", "GPU:0"])
else:
core.init_devices(["CPU"])
for place in places: for place in places:
for data_format in ["NCHW", "NHWC"]: for data_format in ["NCHW", "NHWC"]:
test_with_place(place, data_format, [2, 3, 4, 5]) test_with_place(place, data_format, [2, 3, 4, 5])
......
...@@ -3,14 +3,17 @@ import numpy as np ...@@ -3,14 +3,17 @@ import numpy as np
from op_test import OpTest from op_test import OpTest
def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): def conv2dtranspose_forward_naive(input_, filter_, attrs):
in_n, in_c, in_h, in_w = input_.shape in_n, in_c, in_h, in_w = input_.shape
f_c, out_c, f_h, f_w = filter_.shape f_c, out_c, f_h, f_w = filter_.shape
assert in_c == f_c assert in_c == f_c
stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad'] stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs['dilations']
out_h = (in_h - 1) * stride[0] + f_h d_bolck_h = dilations[0] * (f_h - 1) + 1
out_w = (in_w - 1) * stride[1] + f_w d_bolck_w = dilations[1] * (f_w - 1) + 1
out_h = (in_h - 1) * stride[0] + d_bolck_h
out_w = (in_w - 1) * stride[1] + d_bolck_w
out = np.zeros((in_n, out_c, out_h, out_w)) out = np.zeros((in_n, out_c, out_h, out_w))
...@@ -23,9 +26,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): ...@@ -23,9 +26,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
for k in range(out_c): for k in range(out_c):
tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0) tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
i1, i2 = i * stride[0], i * stride[0] + f_h i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
j1, j2 = j * stride[0], j * stride[0] + f_w j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
out[n, k, i1:i2, j1:j2] += tmp_out out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out
out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
return out return out
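The dilated scatter in the naive reference writes each kernel tap `dilation` pixels apart, over a footprint of `dilation * (f - 1) + 1`. A tiny standalone NumPy sketch of that indexing pattern (not the test itself; all values are illustrative):

    import numpy as np
    f_h, dilation, stride, i = 3, 2, 1, 0
    d_block_h = dilation * (f_h - 1) + 1            # 5: footprint of a dilated 3-tap kernel
    row = np.zeros(7)
    i1, i2 = i * stride, i * stride + d_block_h
    row[i1:i2:dilation] += np.array([1., 2., 3.])   # taps land at positions 0, 2, 4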
...@@ -37,11 +40,8 @@ class TestConv2dTransposeOp(OpTest): ...@@ -37,11 +40,8 @@ class TestConv2dTransposeOp(OpTest):
self.init_op_type() self.init_op_type()
self.init_test_case() self.init_test_case()
conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
input_ = np.random.random(self.input_size).astype("float32") input_ = np.random.random(self.input_size).astype("float32")
filter_ = np.random.random(self.filter_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32")
output = conv2dtranspose_forward_naive(
input_, filter_, conv2dtranspose_param).astype('float32')
self.inputs = {'Input': input_, 'Filter': filter_} self.inputs = {'Input': input_, 'Filter': filter_}
self.attrs = { self.attrs = {
...@@ -49,6 +49,10 @@ class TestConv2dTransposeOp(OpTest): ...@@ -49,6 +49,10 @@ class TestConv2dTransposeOp(OpTest):
'paddings': self.pad, 'paddings': self.pad,
'dilations': self.dilations 'dilations': self.dilations
} }
output = conv2dtranspose_forward_naive(input_, filter_,
self.attrs).astype('float32')
self.outputs = {'Output': output} self.outputs = {'Output': output}
def test_check_output(self): def test_check_output(self):
...@@ -104,11 +108,60 @@ class TestWithStride(TestConv2dTransposeOp): ...@@ -104,11 +108,60 @@ class TestWithStride(TestConv2dTransposeOp):
self.filter_size = [f_c, 6, 3, 3] self.filter_size = [f_c, 6, 3, 3]
class TestWithDilation(TestConv2dTransposeOp):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.dilations = [2, 2]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
# ------------ test_cudnn ------------ # ------------ test_cudnn ------------
class TestCudnn(TestConv2dTransposeOp): class TestCudnn(TestConv2dTransposeOp):
def init_op_type(self): def init_op_type(self):
self.op_type = "conv2d_transpose_cudnn" self.op_type = "conv2d_transpose_cudnn"
class TestCudnnWithPad(TestWithPad):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
def init_op_type(self):
self.op_type = "conv2d_transpose_cudnn"
class TestCudnnWithStride(TestWithStride):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.dilations = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3]
def init_op_type(self):
self.op_type = "conv2d_transpose_cudnn"
# #cudnn v5 does not support dilation conv.
# class TestCudnnWithDilation(TestWithDilation):
# def init_test_case(self):
# self.pad = [1, 1]
# self.stride = [2, 2]
# self.dilations = [2, 2]
# self.input_size = [2, 3, 5, 5] # NCHW
# f_c = self.input_size[1]
# self.filter_size = [f_c, 6, 3, 3]
#
# def init_op_type(self):
# self.op_type = "conv2d_transpose_cudnn"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -3,15 +3,20 @@ import numpy as np ...@@ -3,15 +3,20 @@ import numpy as np
from op_test import OpTest from op_test import OpTest
def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): def conv3dtranspose_forward_naive(input_, filter_, attrs):
in_n, in_c, in_d, in_h, in_w = input_.shape in_n, in_c, in_d, in_h, in_w = input_.shape
f_c, out_c, f_d, f_h, f_w = filter_.shape f_c, out_c, f_d, f_h, f_w = filter_.shape
assert in_c == f_c assert in_c == f_c
stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad'] stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs['dilations']
out_d = (in_d - 1) * stride[0] + f_d d_bolck_d = dilations[0] * (f_d - 1) + 1
out_h = (in_h - 1) * stride[1] + f_h d_bolck_h = dilations[1] * (f_h - 1) + 1
out_w = (in_w - 1) * stride[2] + f_w d_bolck_w = dilations[2] * (f_w - 1) + 1
out_d = (in_d - 1) * stride[0] + d_bolck_d
out_h = (in_h - 1) * stride[1] + d_bolck_h
out_w = (in_w - 1) * stride[2] + d_bolck_w
out = np.zeros((in_n, out_c, out_d, out_h, out_w)) out = np.zeros((in_n, out_c, out_d, out_h, out_w))
for n in range(in_n): for n in range(in_n):
...@@ -25,10 +30,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): ...@@ -25,10 +30,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param):
for k in range(out_c): for k in range(out_c):
tmp_out = np.sum(input_masked * filter_[:, k, :, :, :], tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
axis=0) axis=0)
d1, d2 = d * stride[0], d * stride[0] + f_d d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
i1, i2 = i * stride[1], i * stride[1] + f_h i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
j1, j2 = j * stride[2], j * stride[2] + f_w j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:dilations[2]] += tmp_out
out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
pad[2]] pad[2]]
...@@ -41,18 +47,19 @@ class TestConv3dTransposeOp(OpTest): ...@@ -41,18 +47,19 @@ class TestConv3dTransposeOp(OpTest):
self.init_op_type() self.init_op_type()
self.init_test_case() self.init_test_case()
conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad}
input_ = np.random.random(self.input_size).astype("float32") input_ = np.random.random(self.input_size).astype("float32")
filter_ = np.random.random(self.filter_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32")
output = conv3dtranspose_forward_naive(
input_, filter_, conv3dtranspose_param).astype("float32")
self.inputs = {'Input': input_, 'Filter': filter_} self.inputs = {'Input': input_, 'Filter': filter_}
self.attrs = { self.attrs = {
'strides': self.stride, 'strides': self.stride,
'paddings': self.pad, 'paddings': self.pad,
# 'dilations': self.dilations 'dilations': self.dilations
} }
output = conv3dtranspose_forward_naive(input_, filter_,
self.attrs).astype("float32")
self.outputs = {'Output': output} self.outputs = {'Output': output}
def test_check_output(self): def test_check_output(self):
...@@ -108,11 +115,60 @@ class TestWithStride(TestConv3dTransposeOp): ...@@ -108,11 +115,60 @@ class TestWithStride(TestConv3dTransposeOp):
self.filter_size = [f_c, 6, 3, 3, 3] self.filter_size = [f_c, 6, 3, 3, 3]
class TestWithDilation(TestConv3dTransposeOp):
def init_test_case(self):
self.pad = [1, 1, 1]
self.stride = [1, 1, 1]
self.dilations = [2, 2, 2]
self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
# ------------ test_cudnn ------------ # ------------ test_cudnn ------------
class TestCudnn(TestConv3dTransposeOp): class TestCudnn(TestConv3dTransposeOp):
def init_op_type(self): def init_op_type(self):
self.op_type = "conv3d_transpose_cudnn" self.op_type = "conv3d_transpose_cudnn"
class TestCudnnWithPad(TestWithPad):
def init_test_case(self):
self.pad = [1, 1, 1]
self.stride = [1, 1, 1]
self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
def init_op_type(self):
self.op_type = "conv3d_transpose_cudnn"
class TestCudnnWithStride(TestWithStride):
def init_test_case(self):
self.pad = [1, 1, 1]
self.stride = [2, 2, 2]
self.dilations = [1, 1, 1]
self.input_size = [2, 3, 5, 5, 5] # NCDHW
f_c = self.input_size[1]
self.filter_size = [f_c, 6, 3, 3, 3]
def init_op_type(self):
self.op_type = "conv3d_transpose_cudnn"
# #cudnn v5 does not support dilation conv.
# class TestCudnnWithDilation(TestWithDilation):
# def init_test_case(self):
# self.pad = [1, 1, 1]
# self.stride = [2, 2, 2]
# self.dilations = [2, 2, 2]
# self.input_size = [2, 3, 5, 5, 5] # NCDHW
# f_c = self.input_size[1]
# self.filter_size = [f_c, 6, 3, 3, 3]
#
# def init_op_type(self):
# self.op_type = "conv3d_transpose_cudnn"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -7,7 +7,7 @@ class TestFillZerosLikeOp(OpTest): ...@@ -7,7 +7,7 @@ class TestFillZerosLikeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "fill_zeros_like" self.op_type = "fill_zeros_like"
self.inputs = {'X': np.random.random((219, 232)).astype("float32")} self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
self.outputs = {'Y': np.zeros_like(self.inputs["X"])} self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
......
import unittest import unittest
import numpy
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
import numpy from paddle.v2.fluid.executor import Executor
class TestGaussianRandomOp(unittest.TestCase): class TestGaussianRandomOp(unittest.TestCase):
def setUp(self):
self.op_type = "gaussian_random"
self.inputs = {}
self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
self.outputs = ["Out"]
def test_cpu(self): def test_cpu(self):
self.gaussian_random_test(place=core.CPUPlace()) self.gaussian_random_test(place=fluid.CPUPlace())
def test_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compile_gpu():
self.gaussian_random_test(place=core.GPUPlace(0)) self.gaussian_random_test(place=fluid.GPUPlace(0))
def gaussian_random_test(self, place): def gaussian_random_test(self, place):
scope = core.Scope()
scope.var('Out').get_tensor()
op = Operator(
"gaussian_random",
Out='Out',
shape=[1000, 784],
mean=.0,
std=1.,
seed=10)
context = core.DeviceContext.create(place) context = core.DeviceContext.create(place)
op.run(scope, context) program = fluid.Program()
tensor = numpy.array(scope.find_var('Out').get_tensor()) block = program.global_block()
vout = block.create_var(name="Out")
op = block.append_op(
type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
op.desc.infer_var_type(block.desc)
op.desc.infer_shape(block.desc)
fetch_list = []
for var_name in self.outputs:
fetch_list.append(block.var(var_name))
exe = Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
......
import unittest import unittest
import numpy
from paddle.v2.fluid.op import Operator from paddle.v2.fluid.op import Operator
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import numpy import paddle.v2.fluid as fluid
class TestUniformRandomOp(unittest.TestCase): class TestUniformRandomOp(unittest.TestCase):
def test_uniform_random_cpu(self): def setUp(self):
self.op_type = "uniform_random"
self.inputs = {}
self.attrs = {
"shape": [1000, 784],
"min": -5.0,
"max": 10.0,
"seed": 10
}
self.outputs = ["Out"]
def test_cpu(self):
self.uniform_random_test(place=core.CPUPlace()) self.uniform_random_test(place=core.CPUPlace())
def test_uniform_random_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compile_gpu():
self.uniform_random_test(place=core.GPUPlace(0)) self.uniform_random_test(place=core.GPUPlace(0))
def uniform_random_test(self, place): def uniform_random_test(self, place):
scope = core.Scope() context = core.DeviceContext.create(place)
scope.var('X').get_tensor() program = fluid.Program()
block = program.global_block()
op = Operator( vout = block.create_var(name="Out")
"uniform_random", op = block.append_op(
Out='X', type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
shape=[1000, 784],
min=-5.0, op.desc.infer_var_type(block.desc)
max=10.0, op.desc.infer_shape(block.desc)
seed=10)
fetch_list = []
ctx = core.DeviceContext.create(place) for var_name in self.outputs:
op.run(scope, ctx) fetch_list.append(block.var(var_name))
tensor = numpy.array(scope.find_var('X').get_tensor())
exe = fluid.Executor(place)
outs = exe.run(program, fetch_list=fetch_list)
tensor = outs[0]
self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1) self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
......