diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py index 3358d43a4b08c6a9b89d59e1a8be53ee1f12bbe0..77d130ae34059d1e87040d00346ac1dadd86b0d8 100644 --- a/benchmark/paddle/image/alexnet.py +++ b/benchmark/paddle/image/alexnet.py @@ -6,8 +6,18 @@ height = 227 width = 227 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) +gp = get_config_arg('layer_num', int, 1) +is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer, + 'num_samples': num_samples +} define_py_data_sources2( "train.list", None, module="provider", obj="process", args=args) @@ -31,7 +41,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2) # conv2 net = img_conv_layer( - input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1) + input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp) net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75) net = img_pool_layer(input=net, pool_size=3, stride=2) @@ -40,11 +50,11 @@ net = img_conv_layer( input=net, filter_size=3, num_filters=384, stride=1, padding=1) # conv4 net = img_conv_layer( - input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp) # conv5 net = img_conv_layer( - input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1) + input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp) net = img_pool_layer(input=net, pool_size=3, stride=2) net = fc_layer( @@ -59,6 +69,9 @@ net = fc_layer( layer_attr=ExtraAttr(drop_rate=0.5)) net = fc_layer(input=net, size=1000, act=SoftmaxActivation()) -lab = data_layer('label', num_class) -loss = cross_entropy(input=net, label=lab) -outputs(loss) +if is_infer: + outputs(net) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=net, label=lab) + outputs(loss) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index 7059c13bd2c2b98eb3fbcf633a6f7064e54d5402..2a850ccb7f2c75b467554181fc5f4aa8f2b97a09 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -7,13 +7,15 @@ num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) use_gpu = get_config_arg('use_gpu', bool, True) is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) args = { 'height': height, 'width': width, 'color': True, 'num_class': num_class, - 'is_infer': is_infer + 'is_infer': is_infer, + 'num_samples': num_samples } define_py_data_sources2( "train.list" if not is_infer else None, diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 927b1759941f362ef4b5ffe84dd01332986d9306..1018ec9ce1e529f618ddd7b7afa72a84c5e876a1 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -14,6 +14,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): else: settings.data_size = settings.height * settings.width settings.is_infer = kwargs.get('is_infer', False) + settings.num_samples = kwargs.get('num_samples', 2560) if settings.is_infer: settings.slots = [dense_vector(settings.data_size)] else: @@ -23,7 +24,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): @provider( 
init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(2560 if settings.is_infer else 1024): + for i in xrange(settings.num_samples): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() if settings.is_infer: yield img.astype('float32') diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index 4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f..2846e4763f1cda4602f03af5ec649d57ee6cf0d8 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -7,13 +7,15 @@ num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg("layer_num", int, 50) is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) args = { 'height': height, 'width': width, 'color': True, 'num_class': num_class, - 'is_infer': is_infer + 'is_infer': is_infer, + 'num_samples': num_samples } define_py_data_sources2( "train.list" if not is_infer else None, diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh index d795bcab1b7d098295066f79189d17e8299d28fb..62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7 100755 --- a/benchmark/paddle/image/run_mkl_infer.sh +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -37,7 +37,7 @@ function infer() { --trainer_count=1 \ --num_passes=1 \ --save_dir="models/${topology}-${layer_num}" \ - --config_args="batch_size=128,layer_num=${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \ > /dev/null 2>&1 echo "Done" fi @@ -79,8 +79,9 @@ fi # inference benchmark for use_mkldnn in True False; do for batchsize in 1 2 4 8 16; do - infer googlenet v1 $batchsize $use_mkldnn - infer resnet 50 $batchsize $use_mkldnn infer vgg 19 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer googlenet v1 $batchsize $use_mkldnn + infer alexnet 2 $batchsize $use_mkldnn done done diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh index 5335af5ac1b9a4a48ec107b8b6386b50ead8284c..03d2d378fb72e36f765d89af788f6ee96fe21d4e 100755 --- a/benchmark/paddle/image/run_mkl_train.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -47,5 +47,6 @@ for use_mkldnn in True False; do train vgg 19 $batchsize $use_mkldnn train resnet 50 $batchsize $use_mkldnn train googlenet v1 $batchsize $use_mkldnn + train alexnet 2 $batchsize $use_mkldnn done done diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index c1001d3a7c95a293d0b2b5b78fb7415e167b3e9f..da034f3b9dff794e22086a5295ad2b0c2361c356 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -23,24 +23,25 @@ function infer() { echo "./run_mkl_infer.sh to save the model first" exit 0 fi - log_period=$((256 / bs)) + log_period=$((32 / bs)) paddle train --job=test \ --config="${topology}.py" \ + --use_mkldnn=False \ --use_gpu=False \ --trainer_count=$thread \ --log_period=$log_period \ - --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \ --init_model_path=$models_in \ 2>&1 | tee ${log} - # calculate the last 5 logs period time of 1280 samples, + # calculate the last 5 logs period time of 160(=32*5) samples, # the time before are burning time. 
start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` start_sec=`clock_to_seconds $start` end_sec=`clock_to_seconds $end` - fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` - echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'` + echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} } @@ -56,7 +57,8 @@ fi # inference benchmark for batchsize in 1 2 4 8 16; do - infer googlenet v1 $batchsize - infer resnet 50 $batchsize infer vgg 19 $batchsize + infer resnet 50 $batchsize + infer googlenet v1 $batchsize + infer alexnet 2 $batchsize done diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index b9494ce119523953a3360b2b67e2cb6f3e0f1643..e9df83fee2a3f796b7234b39619364f6ee4d5dc9 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -12,10 +12,11 @@ function train() { config="${topology}.py" paddle train --job=time \ --config=$config \ + --use_mkldnn=False \ --use_gpu=False \ --trainer_count=$thread \ - --log_period=10 \ - --test_period=100 \ + --log_period=3 \ + --test_period=30 \ --config_args=$args \ 2>&1 | tee ${log} @@ -36,4 +37,5 @@ for batchsize in 64 128 256; do train vgg 19 $batchsize train resnet 50 $batchsize train googlenet v1 $batchsize + train alexnet 2 $batchsize done diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 8d0a1e97a451cd52ef17e4e326673cc90059ef3c..ca0a6798fb8c35b68cf84d263855955eb93ba0b0 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -7,13 +7,15 @@ num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg('layer_num', int, 19) is_infer = get_config_arg("is_infer", bool, False) +num_samples = get_config_arg('num_samples', int, 2560) args = { 'height': height, 'width': width, 'color': True, 'num_class': num_class, - 'is_infer': is_infer + 'is_infer': is_infer, + 'num_samples': num_samples } define_py_data_sources2( "train.list" if not is_infer else None, diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index fab2af362bb070a54987b6499748056f3d12a56b..ff5855052dabaa0b63099cd219f3f04e22f1aa85 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -253,9 +253,9 @@ IF(NOT PROTOBUF_FOUND) IF(WITH_C_API) INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf) IF(ANDROID) - INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI}) ELSE() - INSTALL(FILES ${PROTOBUF_LIBRARY} DESTINATION third_party/protobuf/lib) + INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib) ENDIF() ENDIF() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index c3f9c18d0663a7a24880b441981875c1e4f015aa..d81481ca819c13ee0e299c204f998f3915c34bd4 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -467,7 +467,7 @@ lambda_cost :noindex: square_error_cost --------- +----------------- .. autoclass:: paddle.v2.layer.square_error_cost :noindex: @@ -533,7 +533,7 @@ Miscs ===== dropout --------------- +-------- .. 
autoclass:: paddle.v2.layer.dropout :noindex: diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 92ca1cf0f836a376387f3e6f2b5a24c78109323d..ef9febe0aa9d1f65e6608495f6ad7d4f502acf59 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -19,17 +19,17 @@ dynamic_lstm :noindex: data ---------- +---- .. autofunction:: paddle.v2.fluid.layers.data :noindex: mean ---------- +---- .. autofunction:: paddle.v2.fluid.layers.mean :noindex: mul ---------- +--- .. autofunction:: paddle.v2.fluid.layers.mul :noindex: @@ -45,13 +45,13 @@ elementwise_div dropout ---------- +------- .. autofunction:: paddle.v2.fluid.layers.dropout :noindex: reshape ---------- +-------- .. autofunction:: paddle.v2.fluid.layers.reshape :noindex: @@ -81,67 +81,67 @@ transpose sigmoid_cross_entropy_with_logits ---------- +--------------------------------- .. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits :noindex: cast ---------- +---- .. autofunction:: paddle.v2.fluid.layers.cast :noindex: concat ---------- +------- .. autofunction:: paddle.v2.fluid.layers.concat :noindex: sums ---------- +---- .. autofunction:: paddle.v2.fluid.layers.sums :noindex: linear_chain_crf ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.linear_chain_crf :noindex: assign ---------- +------- .. autofunction:: paddle.v2.fluid.layers.embedding :noindex: split_lod_tensor ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.split_lod_tensor :noindex: merge_lod_tensor ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor :noindex: cos_sim ---------- +-------- .. autofunction:: paddle.v2.fluid.layers.cos_sim :noindex: cross_entropy ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.cross_entropy :noindex: square_error_cost ---------- +----------------- .. autofunction:: paddle.v2.fluid.layers.square_error_cost :noindex: @@ -153,68 +153,80 @@ accuracy sequence_conv ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.sequence_conv :noindex: conv2d ---------- +------ .. autofunction:: paddle.v2.fluid.layers.conv2d :noindex: sequence_pool ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.sequence_pool :noindex: +sequence_first_step +------------------- +.. autofunction:: paddle.v2.fluid.layers.sequence_first_step + :noindex: + + +sequence_last_step +------------------ +.. autofunction:: paddle.v2.fluid.layers.sequence_last_step + :noindex: + + pool2d ---------- +------ .. autofunction:: paddle.v2.fluid.layers.pool2d :noindex: batch_norm ---------- +---------- .. autofunction:: paddle.v2.fluid.layers.batch_norm :noindex: beam_search_decode ---------- +------------------ .. autofunction:: paddle.v2.fluid.layers.beam_search_decode :noindex: lod_rank_table ---------- +-------------- .. autofunction:: paddle.v2.fluid.layers.lod_rank_table :noindex: max_sequence_len ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.max_sequence_len :noindex: topk ---------- +----- .. autofunction:: paddle.v2.fluid.layers.topk :noindex: lod_tensor_to_array ---------- +------------------- .. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array :noindex: array_to_lod_tensor ---------- +------------------- .. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor :noindex: @@ -222,26 +234,26 @@ array_to_lod_tensor fill_constant ---------- +------------- .. 
autofunction:: paddle.v2.fluid.layers.fill_constant :noindex: fill_constant_batch_size_like ---------- +----------------------------- .. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like :noindex: ones ---------- +---- .. autofunction:: paddle.v2.fluid.layers.ones :noindex: zeros ---------- +----- .. autofunction:: paddle.v2.fluid.layers.zeros :noindex: @@ -253,14 +265,14 @@ increment array_write ---------- +----------- .. autofunction:: paddle.v2.fluid.layers.array_write :noindex: create_array ---------- +------------ .. autofunction:: paddle.v2.fluid.layers.create_array :noindex: @@ -272,31 +284,31 @@ less_than array_read ---------- +---------- .. autofunction:: paddle.v2.fluid.layers.array_read :noindex: shrink_memory ---------- +-------------- .. autofunction:: paddle.v2.fluid.layers.shrink_memory :noindex: array_length ---------- +------------- .. autofunction:: paddle.v2.fluid.layers.array_length :noindex: conv2d_transpose ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose :noindex: sequence_expand ---------- +--------------- .. autofunction:: paddle.v2.fluid.layers.sequence_expand :noindex: @@ -308,7 +320,19 @@ lstm_unit sequence_softmax ---------- +---------------- .. autofunction:: paddle.v2.fluid.layers.sequence_softmax :noindex: + +reduce_sum +---------- +.. autofunction:: paddle.v2.fluid.layers.reduce_sum + :noindex: + + +reduce_mean +--------- +.. autofunction:: paddle.v2.fluid.layers.reduce_mean + :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst index 2c3d075422de29c96e25458e831133a30270dd39..b792efb71f85ae643df655568da69c82414e9d5d 100644 --- a/doc/api/v2/fluid/nets.rst +++ b/doc/api/v2/fluid/nets.rst @@ -3,19 +3,19 @@ Nets =========== simple_img_conv_pool ------------ +-------------------- .. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool :noindex: img_conv_group ------------ +--------------- .. autofunction:: paddle.v2.fluid.nets.img_conv_group :noindex: sequence_conv_pool ------------ +------------------ .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool :noindex: diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst index 233762fcdfb39e592740adef6721a556fae3feef..19b4940f08de3e2f7dc177f2961e538946d10a78 100644 --- a/doc/api/v2/fluid/optimizer.rst +++ b/doc/api/v2/fluid/optimizer.rst @@ -18,7 +18,7 @@ SGDOptimizer MomentumOptimizer ------------ +----------------- .. automodule:: paddle.v2.fluid.optimizer :members: MomentumOptimizer :noindex: @@ -26,14 +26,14 @@ MomentumOptimizer AdagradOptimizer ------------ +---------------- .. automodule:: paddle.v2.fluid.optimizer :members: AdagradOptimizer :noindex: AdamOptimizer ------------ +------------- .. automodule:: paddle.v2.fluid.optimizer :members: AdamOptimizer :noindex: @@ -47,7 +47,7 @@ AdamaxOptimizer DecayedAdagradOptimizer ------------ +----------------------- .. automodule:: paddle.v2.fluid.optimizer :members: DecayedAdagradOptimizer :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst index 3af2b07d2ae55d99df705fbf1ad2402eee05c435..868e225ed3d59e79aeb217fb88081ea25f80fa2c 100644 --- a/doc/api/v2/fluid/regularizer.rst +++ b/doc/api/v2/fluid/regularizer.rst @@ -3,14 +3,14 @@ Regularizer =========== WeightDecayRegularizer ------------ +---------------------- .. automodule:: paddle.v2.fluid.regularizer :members: WeightDecayRegularizer :noindex: L2DecayRegularizer ------------ +------------------ .. 
automodule:: paddle.v2.fluid.regularizer :members: L2DecayRegularizer :noindex: @@ -18,7 +18,7 @@ L2DecayRegularizer L1DecayRegularizer ------------ +------------------- .. automodule:: paddle.v2.fluid.regularizer :members: L1DecayRegularizer diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md new file mode 100644 index 0000000000000000000000000000000000000000..a54b7da045e1a362626ef066f9ebb56af2c3181a --- /dev/null +++ b/doc/design/kernel_hint_design.md @@ -0,0 +1,57 @@ +## Problem +In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to do this. + +In the current design, we use KernelType to describe one kernel. + +```cpp +struct KernelType { + Place place_; + DataType data_type_; + LayoutType layout_; +}; +``` + `place_`, `data_type_` and `layout_` can be obtained from the input tensors of the operator: `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly. + +The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can overload to choose the KernelType they want to use. + +So we should send the information the user defines in the proto to `GetExpectedKernelType` so it can choose a kernel. + +The problem is, how should we define and send the information for `GetExpectedKernelType` to use? + +## Solution + +### Potential choices +1. Do nothing: let the user add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This can work, but there is a problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose a CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose a CUDNN kernel. + +2. Pre-define all the needed options and use a single attr key such as `kernel_hint` for the user. This is not flexible if the user wants to define more kinds of hints. + +### Final choice +To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for the user to choose from. + +In C++ + +```cpp +const std::string kForceCPU = "force_cpu"; +const std::string kUseCUDNN = "use_cudnn"; +const std::string kUseMKLDNN = "use_mkldnn"; + +KernelType GetExpectedKernelType() { + if (Attr(kForceCPU)) { + return KernelType(CPUPlace, ...) + } else { + ... + } +} +``` + +In Python code + +```python +FORCE_CPU = core.kForceCPU() + +def xx_layer(..., force_cpu=False): + layer_helper = LayerHelper(...) + layer_helper.append_op( + type="xx", + attr={FORCE_CPU: force_cpu}) +``` diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md new file mode 100644 index 0000000000000000000000000000000000000000..aa82e96bf79319f1a57e2ad58aa9826e57be6470 --- /dev/null +++ b/doc/design/operator_kernel_type.md @@ -0,0 +1,91 @@ +# Design Doc: The Keys of Operator Kernel Type +## Problem +An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel.
Before an operator runs, a certain kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows: + +```cpp +struct OpKernelType { + platform::Place place_; + proto::DataType data_type_; +}; +``` +For more details, please refer to the [code](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) on GitHub. + +It contains two keys, `Place` and `DataType`, and these two keys will be hashed into a unique key that represents a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. + +We often implement a kernel of an operator with some computing library on a certain device (place). Note that the computing library and the device do not correspond one-to-one: a device can have many computing libraries, and a computing library can also support several devices. + +For example, the Eigen library can support Nvidia GPU/AMD GPU/CPU, and the MKLDNN library can support Intel CPU/Intel FPGA. So both `Place` and `Library` should be keys of `OpKernelType`. + +It's obvious that different DataTypes, like fp64/fp32/int8, will have different kernels. But the data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration. + +## Solution + +There are four keys that determine the kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`. + +```cpp +struct OpKernelType { + platform::Place place_; + platform::Library library_; + proto::DataType data_type_; + framework::Layout layout_; +}; +``` + +The details are as follows: + +### Place + +`Place` is defined as follows: + +```cpp +typedef boost::variant Place; +``` + +`Place` represents the device memory where the data is located. + + +### Library + +One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable: + +```cpp +enum Library { Plain, MKLDNN, CUDNN }; +``` + +We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take the `Eigen` library as the `Plain` enumerator. +A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now has two default DeviceContexts for CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle. + +If we want to support a new library, a new enumerator needs to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created. + + +### DataType + + +`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported. + +### Layout + +Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need some other descriptions of this block of memory, such as shape (ddim), stride, and layout. + +Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework. + +- We take layout as a data member of Tensor. Layout is actually an enum variable.
If fluid is built with MKLDNN, the memory formats used by MKLDNN will be added into this enum variable too. + +- Users have to set the layout for input data, and some operators, like fill_constant/random, also have to set the layout of the data they generate. Of course, we can have some default layout, like NCHW. + +- The inference of Layout is at run-time, not compile-time. + +- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example: if we want to implement an MKLDNN convolution operator, we have to implement all the kernels for the different layouts, listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro for registering kernels for MKLDNN operators. + +`Layout` is also defined as an enum variable: + +```cpp +enum Layout { + kNCHW, + kNHWC, +#ifdef PADDLE_WITH_MKLDNN + knChw8c + ... +#endif +}; +``` diff --git a/doc/design/refactor/multi_cpu.md b/doc/design/refactor/multi_cpu.md new file mode 100644 index 0000000000000000000000000000000000000000..a8d8ee0422acc84835170a44eb83f9b5f0c6bb40 --- /dev/null +++ b/doc/design/refactor/multi_cpu.md @@ -0,0 +1,43 @@ +# Design Doc: Execute the Program with Multi CPU + +## Abstract + +This Design Doc proposes an approach to run the user-defined Op graph +on multiple CPUs: we will use an auto transpiler to convert the user-defined +Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph. + +## Transpiler + + + +After conversion: + + + +## Implement + +- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph + which will be executed with multiple threads. +- `BlockingCounter` will `Init/Decrement` an atomic counter, and block in `Wait` + until the atomic counter becomes `0` (see the sketch at the end of this doc): + ```cpp + BlockingCounter bc(thread_count); + for (int i = 0; i < thread_count; ++i) { + thread_pool->Start([&bc] { bc.DecrementCount(); }); + } + bc.Wait(); + ``` +- `ParallelDo` Operator + - Initialize a thread pool which is a Singleton. + - Use a block id as the input, and create and run the specified Block on independent scopes + with multiple threads. + - Initialize a `BlockingCounter` instance and wait until all threads are done. +- `Split` Operator will split the Input Tensor into a TensorArray. +- `Merge` merges all the gradients calculated in the different threads + with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`. + +## TODO + +- Improve the optimizer stage with multiple threads, since we could + assign the parameters to different threads and execute + the optimizer with multiple threads.
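For reference, the following is a minimal sketch of a `BlockingCounter` with the interface used above. It is an assumption built on `std::mutex` and `std::condition_variable`, not the actual Fluid implementation:

```cpp
#include <condition_variable>
#include <mutex>

// Minimal BlockingCounter sketch: initialized with the number of worker
// threads; each worker calls DecrementCount() when it finishes, and the
// launching thread blocks in Wait() until the counter reaches zero.
class BlockingCounter {
 public:
  explicit BlockingCounter(int initial_count) : count_(initial_count) {}

  void DecrementCount() {
    std::lock_guard<std::mutex> guard(mu_);
    if (--count_ == 0) {
      cv_.notify_all();
    }
  }

  void Wait() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return count_ == 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  int count_;
};
```

With this interface, each worker started by the thread pool calls `DecrementCount()` when it finishes, and the launching thread blocks in `Wait()` until every worker has reported back, which matches how `ParallelDo` waits for all of its threads.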
diff --git a/doc/design/refactor/src/multi-threads.graffle b/doc/design/refactor/src/multi-threads.graffle new file mode 100644 index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6 Binary files /dev/null and b/doc/design/refactor/src/multi-threads.graffle differ diff --git a/doc/design/refactor/src/multi-threads/multi-threads@3x.png b/doc/design/refactor/src/multi-threads/multi-threads@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9 Binary files /dev/null and b/doc/design/refactor/src/multi-threads/multi-threads@3x.png differ diff --git a/doc/design/refactor/src/multi-threads/single-thread@3x.png b/doc/design/refactor/src/multi-threads/single-thread@3x.png new file mode 100644 index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a Binary files /dev/null and b/doc/design/refactor/src/multi-threads/single-thread@3x.png differ diff --git a/doc/design/switch_kernel.md b/doc/design/switch_kernel.md new file mode 100644 index 0000000000000000000000000000000000000000..1846e5d9f99dd433b44ac6b5ae52893ec8f0d451 --- /dev/null +++ b/doc/design/switch_kernel.md @@ -0,0 +1,66 @@ +## Background +Every operator has many kernels because Fluid supports multiple data types, places, and data layouts. We use `KernelType` to describe the kernel types that operators can hold. + +The `KernelType` is as follows. + +``` +struct KernelType { + Place place_; + DataType data_type_; + LayoutType layout_; +}; +``` + +The `place_` is a descriptor of the device and the computational library, e.g., `MKLDNNPlace`, `CUDAPlace`. + +The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, there will be one major `data_type`. For example, the `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float`/`double`. + +The `layout` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout will invoke a different kernel. + +## Problem + +Ideally, we register a kernel for every operator and every kernel type. However, this is impracticable in the following situations. + +1. Some operators, like CRF, are complicated and inefficient to implement on GPU. The CRF operator will only have a CPU kernel. +2. Some operators will take too much memory. It is better to force them onto the CPU. However, the rest of the operators in the neural network will be performed on the GPU, i.e., the model parallelism problem. +3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c`, and no other library uses `nChw8c`. + +The problems in these situations are similar. We can formalize the problem as follows. + +We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the input of this operator from $kt_{?}$ to any kernel type in $KT$? + +## Solution + +It is clear that transforming the inputs of an operator to adapt to another kernel type is not related to the particular operator, so we should register these transformation methods as global methods. + +We can infer a kernel type from the inputs of an operator.
We call this kernel type the `actual kernel type`, which means it is the kernel type that the operator would actually be performed with. + +We can also get a kernel type from 1) the configuration in the operator description (users may want to force the use of `MKL` for the `conv` operator), or 2) the place of the current executor (the executor is running on the GPU). This kernel type is what we expect the operator to be performed with. We call this kernel type the `expected kernel type`. + +We transform the input data from the `actual` to the `expected` kernel type if the expected kernel type is not the same as the actual kernel type. + +The algorithm is described as follows: + +```cpp +using DataTransformationFN = std::function; +using KernelTypePair = std::pair; + +map g_data_transformation_; + +void OpWithKernel::Run() { + vec inputs = ... + auto actual_kernel_type = GetActualKernelType(inputs); + + // The expected kernel type is related to the actual kernel type. + // For most operators, the expected kernel type is the same as the + // actual kernel type. + // + // So we pass `actual_kernel_type` as a parameter of + // GetExpectedKernelType + auto expect_kernel_type = GetExpectedKernelType(actual_kernel_type); + + auto trans = g_data_transformation_[{actual_kernel_type, expect_kernel_type}]; + + kernel.run(trans(inputs)); +} +``` diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst index c875c807b8ab2e420dec189ef32d41533f58fa6d..41ac07ca5674d2c121baba77c58226ad328cd681 100644 --- a/doc/getstarted/build_and_install/build_from_source_cn.rst +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -70,13 +70,13 @@ PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其 :header: "依赖", "版本", "说明" :widths: 10, 15, 30 - "CMake", ">=3.5", "" + "CMake", ">=3.2", "" "GCC", "4.8.2", "推荐使用CentOS的devtools2" - "Python", "2.7.x", "依赖libpython2.7.so" - "pip", ">=9.0", "" - "numpy", "", "" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" "SWIG", ">=2.0", "" - "Go", ">=1.8", "可选" + "Go", ">=1.8", "可选" .. _build_options: diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst index f194f84ce7c961bb8644d7c077a7c71730220ea2..92211aee8c3bc0ae6e1a38311d40ddf92117cac7 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.rst +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -76,13 +76,13 @@ will be downloaded automatically. :header: "Dependency", "Version", "Description" :widths: 10, 15, 30 - "CMake", ">=3.5", "" + "CMake", ">=3.2", "" "GCC", "4.8.2", "Recommend devtools2 for CentOS" - "Python", "2.7.x", "Need libpython2.7.so" - "pip", ">=9.0", "" - "numpy", "", "" + "Python", "2.7.x", "Need libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" "SWIG", ">=2.0", "" - "Go", ">=1.8", "Optional" + "Go", ">=1.8", "Optional" ..
_build_options: diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 1eb06e4182d40c3be20d71e37b34009905eaf9d6..fa1b6a372728ccac128d2e6e79a6514b8884ea3f 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -128,7 +128,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 -`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。 +`编译 <./build_from_source_cn.html>`_ PaddlePaddle为no-avx版本。 以下指令能检查Linux电脑是否支持AVX: diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 5a46c598f2248c7912169a9e77b16851230c1d2e..06012bf65e75c32957516f6b7f62e09480871b84 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -137,7 +137,7 @@ GPU driver installed before move on. AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations. The latest PaddlePaddle Docker image turns AVX on by default, so, if your computer doesn't support AVX, you'll probably need to -`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`. +`build <./build_from_source_en.html>`_ with :code:`WITH_AVX=OFF`. The following command will tell you whether your computer supports AVX. diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index b270e2c2f0b0cbfd6fb4b9b0750d207952f84d76..a4587f82a984acf243f49834e707fcd66d5b1252 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -37,11 +37,11 @@ PaddlePaddle可以使用常用的Python包管理工具 :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" :widths: 1, 3, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" .. 
_pip_dependency: diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1..55e31560a0f5087ab69966a6281c6c8573c04204 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -40,11 +40,11 @@ If the links below shows up the login form, just click "Log in as guest" to star :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" :widths: 1, 3, 3, 3 - "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" .. 
_pip_dependency: diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 757a5840bca4c8028e362789ec95bb03d261d2c1..3109d72001f13a38a93b9ca39d3f8525c8cea9f1 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -53,7 +53,7 @@ Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU ```cpp class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor), 2D tensor of size (M x K)"); AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); @@ -82,7 +82,7 @@ The equation is: Out = X * Y template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of scale operator.").NotInGradient(); AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index fe86936bc12cc2fb88d653429e250f71a478dfb6..7175d8370d6ce08c6d502eb42b8e53252db89bbb 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -50,7 +50,7 @@ First, define `ProtoMaker` to describe the Operator's input, output, and additio ```cpp class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor), 2D tensor of size (M x K)"); AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); @@ -79,7 +79,7 @@ An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/de template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of scale operator.").NotInGradient(); AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); diff --git a/doc/howto/usage/cluster/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s_distributed_cn.md index 701a9a75d78b53d7dab94529dbd1be382ff0d04e..167089b8074b33e3b094fa3ec8e377630cec42ac 100644 --- a/doc/howto/usage/cluster/k8s_distributed_cn.md +++ b/doc/howto/usage/cluster/k8s_distributed_cn.md @@ -2,8 +2,6 @@ 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 -有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群,可以参考[k8s_basis](./k8s_basis_cn.md)。 - ## 整体方案 在训练之前,用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统,需要使用其制定的方式挂载后并导入数据),训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下: diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index b1e17936417e4ce09bace1d1a5d346d1c9cfa710..b0fd4d2750eb2529706d871947332d39494505cd 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -19,42 +19,42 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -Attribute GetAttrValue(const OpDesc::Attr& attr_desc) { +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { switch (attr_desc.type()) { - case framework::AttrType::BOOLEAN: { + case proto::AttrType::BOOLEAN: { return attr_desc.b(); } - case framework::AttrType::INT: { + case proto::AttrType::INT: { return attr_desc.i(); } - case framework::AttrType::FLOAT: { + case proto::AttrType::FLOAT: { return attr_desc.f(); } - case framework::AttrType::STRING: { + case proto::AttrType::STRING: { return attr_desc.s(); } - case framework::AttrType::BOOLEANS: { + case proto::AttrType::BOOLEANS: { std::vector val(attr_desc.bools_size()); for (int i = 0; i < attr_desc.bools_size(); ++i) { val[i] = attr_desc.bools(i); } return val; } - case framework::AttrType::INTS: { + case proto::AttrType::INTS: { std::vector val(attr_desc.ints_size()); for (int i = 0; i < attr_desc.ints_size(); ++i) { val[i] = attr_desc.ints(i); } return val; } - case framework::AttrType::FLOATS: { + case proto::AttrType::FLOATS: { std::vector val(attr_desc.floats_size()); for (int i = 0; i < attr_desc.floats_size(); ++i) { val[i] = attr_desc.floats(i); } return val; } - case framework::AttrType::STRINGS: { + case proto::AttrType::STRINGS: { std::vector val(attr_desc.strings_size()); for (int i = 0; i < attr_desc.strings_size(); ++i) { val[i] = attr_desc.strings(i); diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index 0641907d6ff7546df1601d3b0263ff42f4186968..c1c63d9cb13acb195b3bc3b30088f5fa7daf2a3d 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -27,12 +27,12 @@ limitations under the License. */ namespace paddle { namespace framework { template -inline AttrType AttrTypeID() { +inline proto::AttrType AttrTypeID() { Attribute tmp = T(); - return static_cast(tmp.which() - 1); + return static_cast(tmp.which() - 1); } -Attribute GetAttrValue(const OpDesc::Attr& attr_desc); +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc); class AttrReader { public: diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index faf6e60cbd1bcda9864c12696b336998ea7606b7..222aee5974ca82c12a9c3b18f7fac4f96ee251bd 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -42,7 +42,7 @@ static std::unordered_set& CtrlFlowOps() { static inline std::unique_ptr CreateGradOp( const OperatorBase& op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var) { - OpDescBind op_desc; + OpDesc op_desc; op_desc.SetInputMap(op.Inputs()); op_desc.SetOutputMap(op.Outputs()); op_desc.SetType(op.Type()); @@ -53,7 +53,7 @@ static inline std::unique_ptr CreateGradOp( grad_ops.reserve(grad_descs.size()); std::transform(grad_descs.begin(), grad_descs.end(), std::back_inserter(grad_ops), - [](const std::unique_ptr& grad_desc) { + [](const std::unique_ptr& grad_desc) { return OpRegistry::CreateOp(*grad_desc); }); PADDLE_ENFORCE(!grad_ops.empty()); @@ -217,7 +217,7 @@ static std::unique_ptr BackwardRecursive( // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. 
net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {grad_input}}}, + {{"Out", {grad_input}}}, AttributeMap{})); } return false; @@ -296,7 +296,7 @@ static std::string FwdName(const std::string& grad_name) { static void CreateGradVarInBlock( size_t grad_op_start_index, const std::unordered_map& param_name_map, - BlockDescBind* block_desc, + BlockDesc* block_desc, std::unordered_map* grad_var_record) { auto ops = block_desc->AllOps(); for (size_t op_index = grad_op_start_index; op_index < ops.size(); @@ -341,7 +341,7 @@ static void CreateGradVarInBlock( auto* param = block_desc->FindVarRecursive(pname); auto* grad = block_desc->FindVar(arg); if (param == nullptr) { - grad->SetDataType(DataType::FP32); + grad->SetDataType(proto::DataType::FP32); } else { grad->SetDataType(param->GetDataType()); } @@ -350,12 +350,11 @@ static void CreateGradVarInBlock( } } -std::vector> MakeOpGrad( - const OpDescBind* op_desc, std::unordered_set* no_grad_vars, +std::vector> MakeOpGrad( + const OpDesc* op_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, - const std::vector& grad_block = - std::vector()) { - std::vector> grad_op_descs; + const std::vector& grad_block = std::vector()) { + std::vector> grad_op_descs; // All input gradients of forwarding operator do not need to calculate. const std::vector& inputs = op_desc->InputArgumentNames(); if (AllGradInSet(inputs, *no_grad_vars)) { @@ -386,7 +385,7 @@ std::vector> MakeOpGrad( .Get(op_desc->Type()) .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block); - std::list> pending_fill_zeros_ops; + std::list> pending_fill_zeros_ops; for (auto& desc : grad_op_descs) { for (const std::string& in_name : desc->InputArgumentNames()) { if (no_grad_vars->count(in_name)) { @@ -394,9 +393,9 @@ std::vector> MakeOpGrad( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op( - new OpDescBind("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {new_name}}}, AttributeMap{})); + std::unique_ptr fill_zeros_op( + new OpDesc("fill_zeros_like", {{"X", {prefix}}}, + {{"Out", {new_name}}}, AttributeMap{})); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } @@ -408,34 +407,33 @@ std::vector> MakeOpGrad( return grad_op_descs; } -static BlockDescBind* CreateStepBlock( - ProgramDescBind& program_desc, - std::unordered_set* no_grad_vars, +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, int step_block_idx); -std::vector> MakeBlockBackward( - ProgramDescBind& program_desc, int block_idx, +std::vector> MakeBlockBackward( + ProgramDesc& program_desc, int block_idx, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var) { VLOG(5) << "MakeBlockBackward"; - BlockDescBind* cur_block = program_desc.MutableBlock(block_idx); - std::vector op_descs = cur_block->AllOps(); + BlockDesc* cur_block = program_desc.MutableBlock(block_idx); + std::vector op_descs = cur_block->AllOps(); std::unordered_map> dup_out_ops; size_t grad_desc_idx = 0; - std::vector> backward_descs; + std::vector> backward_descs; for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) { VLOG(5) << "Making backward " << (*it)->Type() << " op"; - std::vector> op_grads; + std::vector> op_grads; if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { int step_block_idx = (*it)->GetBlockAttr("sub_block"); - BlockDescBind* backward_block = 
CreateStepBlock( - program_desc, no_grad_vars, grad_to_var, step_block_idx); + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, + grad_to_var, step_block_idx); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); } else if ((*it)->Type() == "conditional_block") { - BlockDescBind* backward_block = + BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, grad_to_var, (*it)->GetBlockAttr("sub_block")); op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block}); @@ -463,14 +461,14 @@ std::vector> MakeBlockBackward( } ++grad_desc_idx; } - std::transform( - op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs), - [](std::unique_ptr& ptr) { return std::move(ptr); }); + std::transform(op_grads.begin(), op_grads.end(), + std::back_inserter(backward_descs), + [](std::unique_ptr& ptr) { return std::move(ptr); }); } VLOG(5) << "Appending Sums"; // Check whether some variables are written more than once - std::list>> pending_sum_ops; + std::list>> pending_sum_ops; for (const auto& dup : dup_out_ops) { const std::string& out_name = dup.first; const std::vector dup_op = dup.second; @@ -486,18 +484,17 @@ std::vector> MakeBlockBackward( sum_op_inputs.emplace_back(new_name); next_g_name = sum_op_inputs.back(); } - std::unique_ptr sum_op( - new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, - AttributeMap{})); + std::unique_ptr sum_op(new OpDesc("sum", {{"X", sum_op_inputs}}, + {{"Out", {out_name}}}, + AttributeMap{})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } - pending_sum_ops.sort( - [](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); + pending_sum_ops.sort([](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); for (auto& p : pending_sum_ops) { backward_descs.insert(backward_descs.begin() + p.first + 1, std::move(p.second)); @@ -508,14 +505,13 @@ std::vector> MakeBlockBackward( return backward_descs; } -static BlockDescBind* CreateStepBlock( - ProgramDescBind& program_desc, - std::unordered_set* no_grad_vars, +static BlockDesc* CreateStepBlock( + ProgramDesc& program_desc, std::unordered_set* no_grad_vars, std::unordered_map* grad_to_var, int step_block_idx) { auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx, no_grad_vars, grad_to_var); - BlockDescBind* backward_block = + BlockDesc* backward_block = program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx)); for (auto& ptr : backward_block_op_descs) { backward_block->AppendAllocatedOp(move(ptr)); @@ -524,7 +520,7 @@ static BlockDescBind* CreateStepBlock( } ParamGradInfoMap AppendBackward( - ProgramDescBind& program_desc, const VarDescBind& target, + ProgramDesc& program_desc, const VarDesc& target, const std::unordered_set& no_grad_vars) { std::unordered_set no_grad_var_names; no_grad_var_names.reserve(no_grad_vars.size() + 1); @@ -541,11 +537,11 @@ ParamGradInfoMap AppendBackward( PADDLE_ENFORCE(is_scalar, "target should be scalar"); VLOG(3) << "backward from loss=" << target.Name() << " data_type=" << target.GetDataType(); - std::unique_ptr fill_one_op( - new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}}, - {{"shape", std::vector{1}}, - {"value", static_cast(1.0)}, - {"dtype", target.GetDataType()}})); + std::unique_ptr fill_one_op( + new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}}, + {{"shape", std::vector{1}}, + {"value", static_cast(1.0)}, + {"dtype", target.GetDataType()}})); // infer var type of 
fill_one_op fill_one_op->InferVarType(root_block); diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h index 96154fa82cb7a486aa4762ae633982ed6735220b..2d3b75fe6966cb5dad32dc185a3973e92b23e26e 100644 --- a/paddle/framework/backward.h +++ b/paddle/framework/backward.h @@ -49,7 +49,7 @@ using ParamGradInfoMap = std::unordered_map; ParamGradInfoMap AppendBackward( - ProgramDescBind& program_desc, const VarDescBind& target, + ProgramDesc& program_desc, const VarDesc& target, const std::unordered_set& no_grad_vars); } // namespace framework diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 9fe49881d5b740655432f6e83a7886878ceb17e8..0957646b5642cd9afce5d88b2c638679cb01f198 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -58,13 +58,13 @@ class RowWiseAddGradMaker : public SingleGradOpDescMaker { using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad_op = new OpDescBind(); + std::unique_ptr Apply() const override { + auto grad_op = new OpDesc(); grad_op->SetInput(GradVarName("Out"), OutputGrad("Out")); grad_op->SetOutput(GradVarName("X"), InputGrad("X")); grad_op->SetOutput(GradVarName("b"), InputGrad("b")); grad_op->SetType("rowwise_add_grad"); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -159,14 +159,14 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker { FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "x"); - AddOutput("Y", "out"); + AddOutput("Out", "out"); AddComment(""); } }; class SumOpMaker : public framework::OpProtoAndCheckerMaker { public: - SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + SumOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input tensors of sum operator.").AsDuplicable(); AddOutput("Out", "the output tensor of sum operator."); @@ -190,11 +190,11 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { - std::vector> retv; + std::vector> operator()() const override { + std::vector> retv; auto x_g = InputGrad("X"); if (!x_g.empty()) { - auto *op_desc = new OpDescBind(); + auto *op_desc = new OpDesc(); op_desc->SetType("scale"); op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetOutput("Out", x_g); @@ -204,7 +204,7 @@ class MinusGradOpDescMaker : public GradOpDescMakerBase { auto y_g = InputGrad("Y"); if (!y_g.empty()) { - auto *op_desc = new OpDescBind(); + auto *op_desc = new OpDesc(); op_desc->SetType("scale"); op_desc->SetInput("X", OutputGrad("Out")); op_desc->SetOutput("Out", y_g); @@ -430,8 +430,8 @@ TEST(Backward, op_part_of_output_are_not_need) { ASSERT_EQ("fill_zeros_like", fill_zero.Type()); ASSERT_EQ(1UL, fill_zero.Inputs("X").size()); ASSERT_EQ("Z", fill_zero.Input("X")); - ASSERT_EQ(1UL, fill_zero.Outputs("Y").size()); - ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Y")); + ASSERT_EQ(1UL, fill_zero.Outputs("Out").size()); + ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out")); auto &d_many_out = *net->ops_[1]; ASSERT_EQ("many_output_op_grad", d_many_out.Type()); @@ -505,25 +505,25 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { } TEST(Backward, simple_single_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = 
program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); - f::OpDescBind *op = block->AppendOp(); + f::OpDesc *op = block->AppendOp(); op->SetType("rowwise_add"); op->SetInput("X", {"x"}); op->SetInput("b", {"b"}); op->SetOutput("Out", {"out"}); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); auto var_to_grad = AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); - f::OpDescBind *fill_op = block->AllOps()[1]; + f::OpDesc *fill_op = block->AllOps()[1]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op = block->AllOps()[2]; + f::OpDesc *grad_op = block->AllOps()[2]; EXPECT_EQ(grad_op->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op->InputNames().size(), 1UL); ASSERT_EQ(grad_op->OutputNames().size(), 2UL); @@ -543,16 +543,16 @@ TEST(Backward, simple_single_op) { } TEST(Backward, default_attribute) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op = block->AppendOp(); op->SetType("mul"); op->SetInput("X", {"x"}); op->SetInput("Y", {"y"}); op->SetOutput("Out", {"out"}); op->CheckAttrs(); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); AppendBackward(program, target, std::unordered_set{}); @@ -560,47 +560,47 @@ TEST(Backward, default_attribute) { EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(op->GetAttr("y_num_col_dims")), 1); - f::OpDescBind *fill_op = block->AllOps()[1]; + f::OpDesc *fill_op = block->AllOps()[1]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op = block->AllOps()[2]; + f::OpDesc *grad_op = block->AllOps()[2]; ASSERT_EQ(grad_op->Type(), "mul_grad"); EXPECT_EQ(boost::get(grad_op->GetAttr("x_num_col_dims")), 1); EXPECT_EQ(boost::get(grad_op->GetAttr("y_num_col_dims")), 1); } TEST(Backward, simple_mult_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"out1"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out2"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - auto target = f::VarDescBind("out3"); + auto target = f::VarDesc("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 6UL + 1); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDesc *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -611,7 +611,7 @@ TEST(Backward, simple_mult_op) { 
EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op2 = block->AllOps()[5]; + f::OpDesc *grad_op2 = block->AllOps()[5]; EXPECT_EQ(grad_op2->Type(), "mul_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 4UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -625,7 +625,7 @@ TEST(Backward, simple_mult_op) { EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *grad_op3 = block->AllOps()[4]; + f::OpDesc *grad_op3 = block->AllOps()[4]; EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -655,42 +655,42 @@ TEST(Backward, simple_mult_op) { } TEST(Backward, intermedia_var_no_grad) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"x2"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out2"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - f::OpDescBind *op4 = block->AppendOp(); + f::OpDesc *op4 = block->AppendOp(); op4->SetType("mul"); op4->SetInput("X", {"out1"}); op4->SetInput("Y", {"out3"}); op4->SetOutput("Out", {"out4"}); - auto target = f::VarDescBind("out4"); + auto target = f::VarDesc("out4"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"out3"}); ASSERT_EQ(block->AllOps().size(), 7UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op1 = block->AllOps()[6]; + f::OpDesc *grad_op1 = block->AllOps()[6]; EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -701,7 +701,7 @@ TEST(Backward, intermedia_var_no_grad) { EXPECT_EQ(grad_op1->Output(f::GradVarName("b")), std::vector({f::GradVarName("b1")})); - f::OpDescBind *grad_op4 = block->AllOps()[5]; + f::OpDesc *grad_op4 = block->AllOps()[5]; EXPECT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -726,32 +726,32 @@ TEST(Backward, intermedia_var_no_grad) { } TEST(Backward, var_no_grad) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("mult_in_out"); op1->SetInput("X", {"x1"}); op1->SetInput("H", {"h1"}); op1->SetOutput("Y", {"y1"}); op1->SetOutput("Z", {"z1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mult_in_out"); op2->SetInput("X", {"y1"}); op2->SetInput("H", {"z1"}); op2->SetOutput("Y", {"y2"}); op2->SetOutput("Z", {"z2"}); - auto target = f::VarDescBind("z2"); + auto target = f::VarDesc("z2"); target.SetShape({1}); size_t forward_len = 
block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"z1"}); ASSERT_EQ(block->AllOps().size(), 6UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op2 = block->AllOps()[3]; + f::OpDesc *grad_op2 = block->AllOps()[3]; ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op2->InputNames().size(), 6UL); ASSERT_EQ(grad_op2->OutputNames().size(), 2UL); @@ -767,15 +767,15 @@ TEST(Backward, var_no_grad) { std::vector({f::GradVarName("y1")})); EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector()); - f::OpDescBind *fill_zero_op = block->AllOps()[4]; + f::OpDesc *fill_zero_op = block->AllOps()[4]; ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like"); ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL); ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL); EXPECT_EQ(fill_zero_op->Input("X"), std::vector({"z1"})); - EXPECT_EQ(fill_zero_op->Output("Y"), + EXPECT_EQ(fill_zero_op->Output("Out"), std::vector({std::string("z1") + f::kZeroVarSuffix})); - f::OpDescBind *grad_op1 = block->AllOps()[5]; + f::OpDesc *grad_op1 = block->AllOps()[5]; ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 6UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -803,37 +803,37 @@ TEST(Backward, var_no_grad) { } TEST(Backward, shared_var) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); - f::OpDescBind *op1 = block->AppendOp(); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::OpDesc *op1 = block->AppendOp(); op1->SetType("rowwise_add"); op1->SetInput("X", {"x1"}); op1->SetInput("b", {"b1"}); op1->SetOutput("Out", {"out1"}); - f::OpDescBind *op2 = block->AppendOp(); + f::OpDesc *op2 = block->AppendOp(); op2->SetType("mul"); op2->SetInput("X", {"out1"}); op2->SetInput("Y", {"y2"}); op2->SetOutput("Out", {"out2"}); - f::OpDescBind *op3 = block->AppendOp(); + f::OpDesc *op3 = block->AppendOp(); op3->SetType("rowwise_add"); op3->SetInput("X", {"out1"}); op3->SetInput("b", {"b3"}); op3->SetOutput("Out", {"out3"}); - auto target = f::VarDescBind("out3"); + auto target = f::VarDesc("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 8UL); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); - f::OpDescBind *grad_op3 = block->AllOps()[4]; + f::OpDesc *grad_op3 = block->AllOps()[4]; ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op3->InputNames().size(), 1UL); ASSERT_EQ(grad_op3->OutputNames().size(), 2UL); @@ -844,7 +844,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op3->Output(f::GradVarName("b")), std::vector({f::GradVarName("b3")})); - f::OpDescBind *grad_op4 = block->AllOps()[5]; + f::OpDesc *grad_op4 = block->AllOps()[5]; ASSERT_EQ(grad_op4->Type(), "mul_grad"); ASSERT_EQ(grad_op4->InputNames().size(), 4UL); ASSERT_EQ(grad_op4->OutputNames().size(), 2UL); @@ -858,7 +858,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector({f::GradVarName("y2")})); - f::OpDescBind *sum_op = block->AllOps()[6]; + f::OpDesc *sum_op = block->AllOps()[6]; ASSERT_EQ(sum_op->Type(), "sum"); ASSERT_EQ(sum_op->InputNames().size(), 1UL); ASSERT_EQ(sum_op->OutputNames().size(), 1UL); @@ 
-868,7 +868,7 @@ TEST(Backward, shared_var) { EXPECT_EQ(sum_op->Output("Out"), std::vector({f::GradVarName("out1")})); - f::OpDescBind *grad_op1 = block->AllOps()[7]; + f::OpDesc *grad_op1 = block->AllOps()[7]; ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad"); ASSERT_EQ(grad_op1->InputNames().size(), 1UL); ASSERT_EQ(grad_op1->OutputNames().size(), 2UL); @@ -895,19 +895,19 @@ TEST(Backward, shared_var) { } TEST(Backward, half_backward) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); auto *op1 = block->AppendOp(); op1->SetType("minus"); op1->SetInput("X", {"a"}); op1->SetInput("Y", {"b"}); op1->SetOutput("Out", {"out"}); - auto target = f::VarDescBind("out"); + auto target = f::VarDesc("out"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); auto var_to_grad = AppendBackward(program, target, {"b"}); - f::OpDescBind *fill_op = block->AllOps()[forward_len]; + f::OpDesc *fill_op = block->AllOps()[forward_len]; EXPECT_EQ(fill_op->Type(), "fill_constant"); auto ops = block->AllOps(); ASSERT_EQ(3UL, ops.size()); diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 6a7a07d5cf471a32822cdccf5c616d8748fd1bd7..0668b08ff7ab3c8ca4f1e989fc7af45a8ec5f63c 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -19,18 +19,18 @@ limitations under the License. */ namespace paddle { namespace framework { -VarDescBind *BlockDescBind::Var(const std::string &name) { +VarDesc *BlockDesc::Var(const std::string &name) { auto it = vars_.find(name); if (it != vars_.end()) { return it->second.get(); } need_update_ = true; - auto *var = new VarDescBind(name); + auto *var = new VarDesc(name); vars_[name].reset(var); return var; } -VarDescBind *BlockDescBind::FindVar(const std::string &name) const { +VarDesc *BlockDesc::FindVar(const std::string &name) const { auto it = vars_.find(name); if (it == vars_.end()) { return nullptr; @@ -38,11 +38,11 @@ VarDescBind *BlockDescBind::FindVar(const std::string &name) const { return it->second.get(); } -bool BlockDescBind::HasVar(const std::string &name) const { +bool BlockDesc::HasVar(const std::string &name) const { return vars_.find(name) != vars_.end(); } -VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { if (name == kEmptyVarName) return nullptr; auto it = vars_.find(name); @@ -53,53 +53,67 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { return it->second.get(); } -VarDescBind *BlockDescBind::FindRecursiveOrCreateVar( - const std::string &name_bytes) { - VarDescBind *res = FindVarRecursive(name_bytes); +VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); if (res == nullptr) { res = Var(name_bytes); } return res; } -bool BlockDescBind::HasVarRecursive(const std::string &name) const { +bool BlockDesc::HasVarRecursive(const std::string &name) const { return FindVarRecursive(name) != nullptr; } -std::vector BlockDescBind::AllVars() const { - std::vector res; +std::vector BlockDesc::AllVars() const { + std::vector res; for (const auto &p : vars_) { res.push_back(p.second.get()); } return res; } -OpDescBind *BlockDescBind::AppendOp() { +OpDesc *BlockDesc::AppendOp() { need_update_ = true; - ops_.emplace_back(new OpDescBind()); + ops_.emplace_back(new OpDesc()); return ops_.back().get(); } -void 
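Note for readers following the rename: every backward_test.cc case above follows the same pattern with the new type names — build a ProgramDesc, append forward OpDescs to block 0, describe the target with a VarDesc, then call AppendBackward. A condensed sketch of that pattern (it only restates what the tests above already do; not a new API):

#include <string>
#include <unordered_set>
#include "paddle/framework/backward.h"
#include "paddle/framework/program_desc.h"

namespace f = paddle::framework;

void BuildAndAppendBackward() {
  f::ProgramDesc program;                         // was ProgramDescBind
  f::BlockDesc *block = program.MutableBlock(0);  // was BlockDescBind

  f::OpDesc *op = block->AppendOp();              // was OpDescBind
  op->SetType("rowwise_add");
  op->SetInput("X", {"x"});
  op->SetInput("b", {"b"});
  op->SetOutput("Out", {"out"});

  f::VarDesc target("out");                       // was VarDescBind
  target.SetShape({1});

  // Appends a fill_constant op plus the *_grad ops to the block and
  // returns the map from forward parameters to their gradient info.
  auto var_to_grad =
      f::AppendBackward(program, target, std::unordered_set<std::string>{});
  (void)var_to_grad;
}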
BlockDescBind::AppendAllocatedOp(std::unique_ptr &&op_desc) { +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { need_update_ = true; ops_.emplace_back(std::move(op_desc)); } -OpDescBind *BlockDescBind::PrependOp() { +OpDesc *BlockDesc::PrependOp() { need_update_ = true; - ops_.emplace_front(new OpDescBind()); + ops_.emplace_front(new OpDesc()); return ops_.front().get(); } -std::vector BlockDescBind::AllOps() const { - std::vector res; +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { + return; + } + need_update_ = true; + for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { + auto names = (*it)->InputArgumentNames(); + for (auto n : names) { + // TODO(typhoonzero): delete vars if no other op use it. + VLOG(3) << "deleting var " << n; + } + } + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +std::vector BlockDesc::AllOps() const { + std::vector res; for (const auto &op : ops_) { res.push_back(op.get()); } return res; } -void BlockDescBind::Flush() { +void BlockDesc::Flush() { for (auto &op_desc : ops_) { op_desc->Flush(); } @@ -121,43 +135,43 @@ void BlockDescBind::Flush() { } } -BlockDescBind *BlockDescBind::ParentBlock() const { +BlockDesc *BlockDesc::ParentBlock() const { if (this->desc_->parent_idx() == kNoneBlockIndex) { return nullptr; } return prog_->MutableBlock(static_cast(this->desc_->parent_idx())); } -BlockDesc *BlockDescBind::Proto() { +proto::BlockDesc *BlockDesc::Proto() { Flush(); return desc_; } -BlockDescBind::BlockDescBind(ProgramDescBind *prog, BlockDesc *desc) +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) : prog_(prog), desc_(desc), need_update_(false) { - for (const VarDesc &var_desc : desc_->vars()) { - vars_[var_desc.name()].reset(new VarDescBind(var_desc)); + for (const proto::VarDesc &var_desc : desc_->vars()) { + vars_[var_desc.name()].reset(new VarDesc(var_desc)); } - for (const OpDesc &op_desc : desc_->ops()) { - ops_.emplace_back(new OpDescBind(op_desc, prog)); + for (const proto::OpDesc &op_desc : desc_->ops()) { + ops_.emplace_back(new OpDesc(op_desc, prog)); } } -BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc, - ProgramDescBind *prog) +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDescBind(*op)); + ops_.emplace_back(new OpDesc(*op)); } for (auto &it : other.vars_) { - auto *var = new VarDescBind(*it.second); + auto *var = new VarDesc(*it.second); vars_[it.first].reset(var); } } -void BlockDescBind::ClearPBOps() { +void BlockDesc::ClearPBOps() { auto ops = this->desc_->mutable_ops(); while (!ops->empty()) { // we do not own the OpDesc, so release the ownership. @@ -165,7 +179,7 @@ void BlockDescBind::ClearPBOps() { } } -void BlockDescBind::ClearPBVars() { +void BlockDesc::ClearPBVars() { auto vars = this->desc_->mutable_vars(); while (!vars->empty()) { // we do not own the VarDesc, so release the ownership. diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h index 8e967e5378eb47a7869efb59cc96a271f1cbb9a1..6c8c81b332d99e52db41018e117aa837be6745bc 100644 --- a/paddle/framework/block_desc.h +++ b/paddle/framework/block_desc.h @@ -28,20 +28,19 @@ limitations under the License. */ namespace paddle { namespace framework { -class ProgramDescBind; +class ProgramDesc; // Each Protobuf Message, we provide a XXXBind class. 
In that class, we optimize // read/write speed. Only when we want the protobuf message, the local changes // will be synchronized (by `Sync` method). -class BlockDescBind { +class BlockDesc { public: - BlockDescBind(ProgramDescBind *prog, BlockDesc *desc); + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); - BlockDescBind(const BlockDescBind &other, BlockDesc *desc, - ProgramDescBind *prog); + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); - ~BlockDescBind() { + ~BlockDesc() { this->ClearPBVars(); this->ClearPBOps(); } @@ -50,15 +49,15 @@ class BlockDescBind { int32_t Parent() const { return desc_->parent_idx(); } - VarDescBind *Var(const std::string &name_bytes); + VarDesc *Var(const std::string &name_bytes); - VarDescBind *FindVar(const std::string &name_bytes) const; + VarDesc *FindVar(const std::string &name_bytes) const; bool HasVar(const std::string &var_name) const; - VarDescBind *FindVarRecursive(const std::string &name_bytes) const; + VarDesc *FindVarRecursive(const std::string &name_bytes) const; - VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes); + VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes); bool HasVarRecursive(const std::string &var_name) const; @@ -70,41 +69,43 @@ class BlockDescBind { return var_names; } - std::vector AllVars() const; + std::vector AllVars() const; - BlockDescBind *ParentBlock() const; + BlockDesc *ParentBlock() const; - OpDescBind *AppendOp(); + OpDesc *AppendOp(); - void AppendAllocatedOp(std::unique_ptr &&op_desc); + void AppendAllocatedOp(std::unique_ptr &&op_desc); - OpDescBind *PrependOp(); + OpDesc *PrependOp(); - std::vector AllOps() const; + void RemoveOp(size_t s, size_t e); + + std::vector AllOps() const; size_t OpSize() const { return ops_.size(); } - OpDescBind *Op(int idx) { return ops_.at(idx).get(); } + OpDesc *Op(int idx) { return ops_.at(idx).get(); } void Flush(); - BlockDesc *Proto(); + proto::BlockDesc *Proto(); - ProgramDescBind *Program() { return this->prog_; } + ProgramDesc *Program() { return this->prog_; } private: void ClearPBOps(); void ClearPBVars(); private: - ProgramDescBind *prog_; // not_own - BlockDesc *desc_; // not_own + ProgramDesc *prog_; // not_own + proto::BlockDesc *desc_; // not_own bool need_update_; - std::deque> ops_; - std::unordered_map> vars_; + std::deque> ops_; + std::unordered_map> vars_; - DISABLE_COPY_AND_ASSIGN(BlockDescBind); + DISABLE_COPY_AND_ASSIGN(BlockDesc); }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h new file mode 100644 index 0000000000000000000000000000000000000000..7429de7ee39297c26360984809e2451100f7b3ff --- /dev/null +++ b/paddle/framework/data_layout.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
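The newly added BlockDesc::RemoveOp(s, e) above erases the ops whose indices fall in the half-open range [s, e) and marks the block dirty so the protobuf is re-synced on the next Flush(). A small sketch of how it composes with AppendOp/AllOps (the op type here is a placeholder, chosen only for illustration):

#include "paddle/framework/program_desc.h"

namespace f = paddle::framework;

void TrimBlock() {
  f::ProgramDesc program;
  f::BlockDesc *block = program.MutableBlock(0);

  for (int i = 0; i < 4; ++i) {
    f::OpDesc *op = block->AppendOp();
    op->SetType("fill_constant");  // placeholder op type
  }

  // Drop the 2nd and 3rd ops; indices form the half-open range [1, 3).
  block->RemoveOp(1, 3);

  size_t remaining = block->AllOps().size();  // expected: 2
  (void)remaining;
}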
*/ + +#pragma once + +namespace paddle { +namespace framework { + +enum DataLayout { + kNHWC = 0, + kNCHW = 1, + kAnyLayout = 2, +}; + +inline DataLayout StringToDataLayout(const std::string& str) { + if (str == "NHWC" || str == "nhwc") { + return DataLayout::kNHWC; + } else if (str == "NCHW" || str == "nchw") { + return DataLayout::kNCHW; + } else { + PADDLE_THROW("Unknown storage order string: %s", str); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h index c54d2d4ddf09c445fb25c1fbe8a7498f233d8212..e94ee2ed52bc40f52caf783f971dd0b560534e08 100644 --- a/paddle/framework/data_type.h +++ b/paddle/framework/data_type.h @@ -20,7 +20,8 @@ namespace paddle { namespace framework { -inline DataType ToDataType(std::type_index type) { +inline proto::DataType ToDataType(std::type_index type) { + using namespace paddle::framework::proto; if (typeid(float).hash_code() == type.hash_code()) { return DataType::FP32; } else if (typeid(double).hash_code() == type.hash_code()) { @@ -36,7 +37,8 @@ inline DataType ToDataType(std::type_index type) { } } -inline std::type_index ToTypeIndex(DataType type) { +inline std::type_index ToTypeIndex(proto::DataType type) { + using namespace paddle::framework::proto; switch (type) { case DataType::FP32: return typeid(float); @@ -54,7 +56,8 @@ inline std::type_index ToTypeIndex(DataType type) { } template -inline void VisitDataType(DataType type, Visitor visitor) { +inline void VisitDataType(proto::DataType type, Visitor visitor) { + using namespace paddle::framework::proto; switch (type) { case DataType::FP32: visitor.template operator()(); diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h index f91e0e03410c95f84a65f02beed38b7bbfdcaa86..7f5151c41d6046f21f7a9707e45de85ec50219ad 100644 --- a/paddle/framework/details/op_registry.h +++ b/paddle/framework/details/op_registry.h @@ -90,7 +90,7 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->proto_ = new OpProto; + info->proto_ = new proto::OpProto; info->checker_ = new OpAttrChecker(); auto maker = T(info->proto_, info->checker_); maker.Validate(); @@ -106,10 +106,10 @@ template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { info->grad_op_maker_ = []( - const OpDescBind& fwd_op, + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var, - const std::vector& grad_block) { + const std::vector& grad_block) { T maker(fwd_op, no_grad_set, grad_to_var, grad_block); return maker(); }; @@ -119,7 +119,7 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) { + info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { T inference; inference(fwd_op, block); }; diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index a8b8a6f8e82525bd9a1f709516483de6f44142dc..14ae37ec49c12203381e74b3f9174a460e41c18e 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -41,20 +41,20 @@ Executor::Executor(const std::vector& places) { device_contexts_.swap(borrowed_contexts); } -static void CreateTensor(Variable* var, VarDesc::VarType var_type) { - if (var_type == VarDesc::LOD_TENSOR) { +static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { + if (var_type == 
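The new data_layout.h above is a small helper: DataLayout enumerates tensor memory orders and StringToDataLayout maps the usual "NHWC"/"NCHW" spellings onto it, throwing on anything else. A brief usage sketch (assuming the header is included under the path given in this patch):

#include <string>
#include "paddle/framework/data_layout.h"

namespace f = paddle::framework;

f::DataLayout LayoutFromAttr(const std::string &attr) {
  // "NHWC"/"nhwc" -> kNHWC, "NCHW"/"nchw" -> kNCHW; anything else hits
  // PADDLE_THROW("Unknown storage order string: %s").
  return f::StringToDataLayout(attr);
}

// e.g. LayoutFromAttr("NCHW") yields f::DataLayout::kNCHW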
proto::VarDesc::LOD_TENSOR) { var->GetMutable(); - } else if (var_type == VarDesc::SELECTED_ROWS) { + } else if (var_type == proto::VarDesc::SELECTED_ROWS) { var->GetMutable(); - } else if (var_type == VarDesc::FEED_MINIBATCH) { + } else if (var_type == proto::VarDesc::FEED_MINIBATCH) { var->GetMutable(); - } else if (var_type == VarDesc::FETCH_LIST) { + } else if (var_type == proto::VarDesc::FETCH_LIST) { var->GetMutable(); - } else if (var_type == VarDesc::STEP_SCOPES) { + } else if (var_type == proto::VarDesc::STEP_SCOPES) { var->GetMutable>(); - } else if (var_type == VarDesc::LOD_RANK_TABLE) { + } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) { var->GetMutable(); - } else if (var_type == VarDesc::LOD_TENSOR_ARRAY) { + } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { var->GetMutable(); } else { PADDLE_THROW( @@ -64,8 +64,8 @@ static void CreateTensor(Variable* var, VarDesc::VarType var_type) { } } -void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, - bool create_local_scope) { +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars) { // TODO(tonyyang-svail): // - only runs on the first device (i.e. no interdevice communication) // - will change to use multiple blocks for RNN op and Cond Op @@ -74,33 +74,35 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, auto& device = device_contexts_[0]; Scope* local_scope = scope; - if (create_local_scope) { - local_scope = &scope->NewScope(); - for (auto& var : block.AllVars()) { - if (var->Name() == framework::kEmptyVarName) { - continue; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = local_scope->Var(var->Name()); + CreateTensor(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } } - - if (var->Persistable()) { - auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; - } else { + } else { + for (auto& var : block.AllVars()) { auto* ptr = local_scope->Var(var->Name()); CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; } - } - } else { - for (auto& var : block.AllVars()) { - auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); - VLOG(3) << "Create variable " << var->Name() << ", which pointer is " - << ptr; - } - } + } // if (create_local_scope) + } // if (create_vars) for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index 073e04729b1166f1cabd16709d161fda0d580f1c..a3d1609293a0d687c33447ca7a0df95c6aac3bc5 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -40,6 +40,16 @@ class DeviceContextPool { return *pool; } + const platform::DeviceContext* Borrow(const platform::Place& place) { + auto range = device_contexts_.equal_range(place); + if (range.first == 
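Executor::Run above gains a create_vars switch alongside create_local_scope: when create_vars is false the executor skips variable creation entirely and assumes the scope has already been populated; when it is true the previous behaviour applies. A hedged sketch of a call site this enables (place/device setup elided; the second call reuses variables created by the first):

#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"

namespace f = paddle::framework;

void RunTwice(f::Executor &exe, const f::ProgramDesc &program,
              f::Scope *scope) {
  // First run: create every variable of block 0 directly in `scope`.
  exe.Run(program, scope, /*block_id=*/0,
          /*create_local_scope=*/false, /*create_vars=*/true);

  // Later runs: skip variable creation; the scope is already populated.
  exe.Run(program, scope, /*block_id=*/0,
          /*create_local_scope=*/false, /*create_vars=*/false);
}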
range.second) { + PADDLE_THROW( + "'Place' is not supported, Please re-compile with WITH_GPU " + "option"); + } + return range.first->second; + } + std::vector Borrow( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); @@ -114,7 +124,8 @@ class Executor { * ProgramDesc * Scope */ - void Run(const ProgramDescBind&, Scope*, int, bool create_local_scope = true); + void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, + bool create_vars = true); private: std::vector device_contexts_; diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index f1fc4529e15502927560eefd74110f6ca7eab4a9..4f2746e4b86ee5fe095897ff6ef9d3f6473e8a14 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -14,7 +14,7 @@ limitations under the License. */ syntax = "proto2"; option optimize_for = LITE_RUNTIME; -package paddle.framework; +package paddle.framework.proto; enum AttrType { INT = 0; diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h index 998186e33915a11f2864eb5387d19ed1bfbab51c..cf411fa710103350713342b43946697a8dd2aa46 100644 --- a/paddle/framework/grad_op_desc_maker.h +++ b/paddle/framework/grad_op_desc_maker.h @@ -22,21 +22,27 @@ namespace paddle { namespace framework { +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient varialbe will be ignored or kEmptyVarName depending on the template + argument DropEmptyIG in the derived classes. + */ class GradOpDescMakerBase { public: explicit GradOpDescMakerBase( - const OpDescBind& fwd_op, - const std::unordered_set& no_grad_set, + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var, - const std::vector& grad_block = - std::vector()) + const std::vector& grad_block = std::vector()) : fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var), grad_block_(grad_block) {} virtual ~GradOpDescMakerBase() = default; - virtual std::vector> operator()() const = 0; + virtual std::vector> operator()() const = 0; protected: std::vector InputGrad(const std::string& name, @@ -58,6 +64,16 @@ class GradOpDescMakerBase { if (!drop_empty_grad) { return ret_val; } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence bewteen a variable and its gradient" + " ambiguous. Use REGISTER_OP_EX to register the op" + " or call InputGrad(?,false) in GradOpDescMaker." 
+ " Op type %s", + fwd_op_.Type()); + std::vector dropped_ret_val; dropped_ret_val.reserve(ret_val.size()); std::copy_if(ret_val.begin(), ret_val.end(), @@ -105,26 +121,26 @@ class GradOpDescMakerBase { std::string ForwardOpType() const { return this->fwd_op_.Type(); } private: - const OpDescBind& fwd_op_; + const OpDesc& fwd_op_; const std::unordered_set& no_grad_set_; std::unordered_map* grad_to_var_; protected: - std::vector grad_block_; + std::vector grad_block_; }; class SingleGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const { - std::vector> retv; + std::vector> operator()() const { + std::vector> retv; retv.emplace_back(this->Apply()); return retv; } protected: - virtual std::unique_ptr Apply() const = 0; + virtual std::unique_ptr Apply() const = 0; }; template @@ -133,8 +149,8 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto* grad = new OpDescBind(); + virtual std::unique_ptr Apply() const { + auto* grad = new OpDesc(); grad->SetType(this->GradOpType()); for (auto& input_param : this->InputNames()) { @@ -150,7 +166,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { grad->SetAttrMap(this->Attrs()); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } virtual std::string GradOpType() const { @@ -161,7 +177,7 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { class EmptyGradOpMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { + std::vector> operator()() const override { return {}; } }; diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h new file mode 100644 index 0000000000000000000000000000000000000000..68e9cabb667a5b2421fad8333d8e1be7bfa57002 --- /dev/null +++ b/paddle/framework/library_type.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace paddle { +namespace framework { + +// For more details about the design of LibraryType, Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library + +enum LibraryType { kPlain = 0; kMKLDNN = 1; kCUDNN = 2; } + +} // namespace +} // framework diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc index 1c2fba70c8ab0827ba6d1563f08cd0820650822e..17d524c09276fc0eb166925bd79bc0bdfcead195 100644 --- a/paddle/framework/lod_rank_table.cc +++ b/paddle/framework/lod_rank_table.cc @@ -46,4 +46,13 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) { } } // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} } // namespace paddle diff --git a/paddle/framework/lod_rank_table.h b/paddle/framework/lod_rank_table.h index 9faa3a4d7bdc55ab7b24e31f5e5434dacc0a4b36..d3007d3d7379a59b32465cbd55780c6268e0e4a8 100644 --- a/paddle/framework/lod_rank_table.h +++ b/paddle/framework/lod_rank_table.h @@ -13,6 +13,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/framework/lod_tensor.h" namespace paddle { @@ -52,4 +53,8 @@ class LoDRankTable { }; } // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + } // namespace paddle diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index fdf6de4babff3bb3c253aaf516636882237e6faf..465f8c62b5fe2efd549f68bb3a9823d299ba5393 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -197,7 +197,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, { // the 2nd field, tensor description // int32_t size // void* protobuf message - framework::TensorDesc desc; + proto::TensorDesc desc; desc.set_data_type(framework::ToDataType(tensor.type())); auto dims = framework::vectorize(tensor.dims()); auto *pb_dims = desc.mutable_dims(); @@ -262,7 +262,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); - framework::TensorDesc desc; + proto::TensorDesc desc; { // int32_t size // proto buffer int32_t size; @@ -281,16 +281,16 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { void *buf; platform::Place cpu = platform::CPUPlace(); switch (desc.data_type()) { - case framework::FP32: + case proto::FP32: buf = tensor->mutable_data(cpu); break; - case framework::FP64: + case proto::FP64: buf = tensor->mutable_data(cpu); break; - case framework::INT32: + case proto::INT32: buf = tensor->mutable_data(cpu); break; - case framework::INT64: + case proto::INT64: buf = tensor->mutable_data(cpu); break; default: diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 9411c96aea4c10ebf921cc3e3b442769c8acbefa..0923c52a0ad2fe10cea760df20c99021984ad39d 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -184,6 +184,18 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, return tensor; } +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// relative length of details for every levels(i.e., [start_level: ]). 
+// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} std::pair> GetSubLoDAndAbsoluteOffset( const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 7ba1e3e4e3270f4cd88e41e245f24c3cfc8aaab7..b361e64438251c1df827667fb825e7f5909fb09e 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -25,12 +25,11 @@ limitations under the License. */ namespace paddle { namespace framework { -class OpDescBind; -class BlockDescBind; +class OpDesc; +class BlockDesc; class CompileTimeInferShapeContext : public InferShapeContext { public: - CompileTimeInferShapeContext(const OpDescBind &op, - const BlockDescBind &block); + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); bool HasInput(const std::string &name) const override; @@ -58,11 +57,11 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_LT(j, Outputs(out).size()); auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); - if (in_var->GetType() != VarDesc::LOD_TENSOR) { + if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) { VLOG(3) << "input " << in << " is not LodTensor"; return; } - PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, + PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); out_var->SetLoDLevel(in_var->GetLodLevel()); @@ -70,19 +69,18 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override; protected: - VarDesc::VarType GetVarType(const std::string &name) const override; + proto::VarDesc::VarType GetVarType(const std::string &name) const override; DDim GetDim(const std::string &name) const override; void SetDim(const std::string &name, const DDim &dim) override; - const OpDescBind &op_; - const BlockDescBind &block_; + const OpDesc &op_; + const BlockDesc &block_; }; -OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs) { +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { desc_.set_type(type); inputs_ = inputs; outputs_ = outputs; @@ -90,12 +88,12 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs, need_update_ = true; } -OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) : desc_(desc), need_update_(false) { // restore inputs_ int input_size = desc_.inputs_size(); for (int i = 0; i < input_size; ++i) { - const OpDesc::Var &var = desc_.inputs(i); + const proto::OpDesc::Var &var = desc_.inputs(i); std::vector &args = inputs_[var.parameter()]; int argu_size = var.arguments_size(); args.reserve(argu_size); @@ -106,7 +104,7 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) // restore outputs_ int output_size = desc_.outputs_size(); for (int i = 0; i < output_size; ++i) { - const OpDesc::Var &var = desc_.outputs(i); + const proto::OpDesc::Var &var = desc_.outputs(i); std::vector &args = outputs_[var.parameter()]; int argu_size = var.arguments_size(); args.reserve(argu_size); @@ -115,9 +113,9 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind 
*prog) } } // restore attrs_ - for (const OpDesc::Attr &attr : desc_.attrs()) { + for (const proto::OpDesc::Attr &attr : desc_.attrs()) { std::string attr_name = attr.name(); - if (attr.type() != AttrType::BLOCK) { + if (attr.type() != proto::AttrType::BLOCK) { attrs_[attr_name] = GetAttrValue(attr); } else { auto bid = attr.block_idx(); @@ -126,20 +124,19 @@ OpDescBind::OpDescBind(const OpDesc &desc, ProgramDescBind *prog) } } -OpDesc *OpDescBind::Proto() { +proto::OpDesc *OpDesc::Proto() { Flush(); return &desc_; } -const std::vector &OpDescBind::Input( - const std::string &name) const { +const std::vector &OpDesc::Input(const std::string &name) const { auto it = inputs_.find(name); PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, Type()); return it->second; } -std::vector OpDescBind::InputArgumentNames() const { +std::vector OpDesc::InputArgumentNames() const { std::vector retv; for (auto &ipt : this->inputs_) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); @@ -147,21 +144,20 @@ std::vector OpDescBind::InputArgumentNames() const { return retv; } -void OpDescBind::SetInput(const std::string ¶m_name, - const std::vector &args) { +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { need_update_ = true; inputs_[param_name] = args; } -const std::vector &OpDescBind::Output( - const std::string &name) const { +const std::vector &OpDesc::Output(const std::string &name) const { auto it = outputs_.find(name); PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", name, Type()); return it->second; } -std::vector OpDescBind::OutputArgumentNames() const { +std::vector OpDesc::OutputArgumentNames() const { std::vector retv; for (auto &ipt : this->outputs_) { retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); @@ -169,19 +165,19 @@ std::vector OpDescBind::OutputArgumentNames() const { return retv; } -void OpDescBind::SetOutput(const std::string ¶m_name, - const std::vector &args) { +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { need_update_ = true; this->outputs_[param_name] = args; } -AttrType OpDescBind::GetAttrType(const std::string &name) const { +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); - return static_cast(it->second.which() - 1); + return static_cast(it->second.which() - 1); } -std::vector OpDescBind::AttrNames() const { +std::vector OpDesc::AttrNames() const { std::vector retv; retv.reserve(attrs_.size()); for (auto &attr : attrs_) { @@ -190,41 +186,39 @@ std::vector OpDescBind::AttrNames() const { return retv; } -void OpDescBind::SetAttr(const std::string &name, const Attribute &v) { +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = v; need_update_ = true; } -void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) { +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) { this->attrs_[name] = █ need_update_ = true; } -void OpDescBind::SetAttrMap( +void OpDesc::SetAttrMap( const std::unordered_map &attr_map) { attrs_ = attr_map; need_update_ = true; } -Attribute OpDescBind::GetAttr(const std::string &name) const { +Attribute OpDesc::GetAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); return it->second; } -int OpDescBind::GetBlockAttr(const std::string &name) const { +int 
OpDesc::GetBlockAttr(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); - return boost::get(it->second)->ID(); + return boost::get(it->second)->ID(); } -const std::unordered_map &OpDescBind::GetAttrMap() - const { +const std::unordered_map &OpDesc::GetAttrMap() const { return attrs_; } -void OpDescBind::Rename(const std::string &old_name, - const std::string &new_name) { +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { for (auto &input : inputs_) { std::replace(input.second.begin(), input.second.end(), old_name, new_name); } @@ -235,8 +229,8 @@ void OpDescBind::Rename(const std::string &old_name, need_update_ = true; } -void OpDescBind::RenameOutput(const std::string &old_name, - const std::string &new_name) { +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { for (auto &output : outputs_) { std::replace(output.second.begin(), output.second.end(), old_name, new_name); @@ -244,8 +238,8 @@ void OpDescBind::RenameOutput(const std::string &old_name, need_update_ = true; } -void OpDescBind::RenameInput(const std::string &old_name, - const std::string &new_name) { +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { for (auto &input : inputs_) { std::replace(input.second.begin(), input.second.end(), old_name, new_name); } @@ -253,8 +247,8 @@ void OpDescBind::RenameInput(const std::string &old_name, } struct SetAttrDescVisitor : public boost::static_visitor { - explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {} - mutable OpDesc::Attr *attr_; + explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} + mutable proto::OpDesc::Attr *attr_; void operator()(int v) const { attr_->set_i(v); } void operator()(float v) const { attr_->set_f(v); } void operator()(const std::string &v) const { attr_->set_s(v); } @@ -272,11 +266,13 @@ struct SetAttrDescVisitor : public boost::static_visitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } - void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); } + void operator()(proto::BlockDesc *desc) const { + attr_->set_block_idx(desc->idx()); + } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; -void OpDescBind::Flush() { +void OpDesc::Flush() { if (need_update_) { this->desc_.mutable_inputs()->Clear(); for (auto &ipt : inputs_) { @@ -297,7 +293,7 @@ void OpDescBind::Flush() { auto *attr_desc = desc_.add_attrs(); attr_desc->set_name(attr.first); attr_desc->set_type( - static_cast(attr.second.which() - 1)); + static_cast(attr.second.which() - 1)); SetAttrDescVisitor visitor(attr_desc); boost::apply_visitor(visitor, attr.second); } @@ -328,7 +324,7 @@ static void InitInferShapeFuncs() { }); } -void OpDescBind::CheckAttrs() { +void OpDesc::CheckAttrs() { PADDLE_ENFORCE(!Type().empty(), "CheckAttr() can not be called before type is setted."); auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); @@ -340,7 +336,7 @@ void OpDescBind::CheckAttrs() { checker->Check(attrs_); } -void OpDescBind::InferShape(const BlockDescBind &block) const { +void OpDesc::InferShape(const BlockDesc &block) const { VLOG(3) << "CompileTime infer shape on " << Type(); InitInferShapeFuncs(); auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; @@ -363,7 +359,7 @@ void OpDescBind::InferShape(const BlockDescBind &block) const { infer_shape(&ctx); } -void 
OpDescBind::InferVarType(BlockDescBind *block) const { +void OpDesc::InferVarType(BlockDesc *block) const { auto &info = OpInfoMap::Instance().Get(this->Type()); if (info.infer_var_type_) { info.infer_var_type_(*this, block); @@ -375,14 +371,14 @@ void OpDescBind::InferVarType(BlockDescBind *block) const { for (auto &out_pair : this->outputs_) { for (auto &out_var_name : out_pair.second) { block->FindRecursiveOrCreateVar(out_var_name) - ->SetType(VarDesc::LOD_TENSOR); + ->SetType(proto::VarDesc::LOD_TENSOR); } } } } CompileTimeInferShapeContext::CompileTimeInferShapeContext( - const OpDescBind &op, const BlockDescBind &block) + const OpDesc &op, const BlockDesc &block) : op_(op), block_(block) {} bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { @@ -484,7 +480,7 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name, } bool CompileTimeInferShapeContext::IsRuntime() const { return false; } -VarDesc::VarType CompileTimeInferShapeContext::GetVarType( +proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType( const std::string &name) const { return block_.FindVarRecursive(name)->GetType(); } diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index da032319afa775571d3942bf6ae415db7d233735..93d4a88f3c390551ab41e42ec2f6f30f52e306db 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -23,19 +23,19 @@ limitations under the License. */ namespace paddle { namespace framework { -class BlockDescBind; -class ProgramDescBind; +class BlockDesc; +class ProgramDesc; -class OpDescBind { +class OpDesc { public: - OpDescBind() {} + OpDesc() {} - OpDescBind(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const AttributeMap &attrs); + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); - OpDescBind(const OpDesc &desc, ProgramDescBind *prog); + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); - OpDesc *Proto(); + proto::OpDesc *Proto(); std::string Type() const { return desc_.type(); } @@ -59,13 +59,13 @@ class OpDescBind { return attrs_.find(name) != attrs_.end(); } - AttrType GetAttrType(const std::string &name) const; + proto::AttrType GetAttrType(const std::string &name) const; std::vector AttrNames() const; void SetAttr(const std::string &name, const Attribute &v); - void SetBlockAttr(const std::string &name, BlockDescBind &block); + void SetBlockAttr(const std::string &name, BlockDesc &block); Attribute GetAttr(const std::string &name) const; @@ -107,9 +107,9 @@ class OpDescBind { void CheckAttrs(); - void InferShape(const BlockDescBind &block) const; + void InferShape(const BlockDesc &block) const; - void InferVarType(BlockDescBind *block) const; + void InferVarType(BlockDesc *block) const; void MarkAsTarget() { desc_.set_is_target(true); } @@ -126,8 +126,10 @@ class OpDescBind { return ret_val; } - OpDesc desc_; + proto::OpDesc desc_; + // input arg name => output variable names VariableNameMap inputs_; + // output arg name => output variable names VariableNameMap outputs_; AttributeMap attrs_; diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h index d3b1a3b5fa2cf8f6a9571e92a319f3757666657e..7772d6e745c2207024863d3dd5cbef052358272e 100644 --- a/paddle/framework/op_info.h +++ b/paddle/framework/op_info.h @@ -34,7 +34,7 @@ class InferShapeBase { struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; - OpProto* proto_{nullptr}; + proto::OpProto* proto_{nullptr}; 
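The op_desc.cc/op_desc.h hunks above carry the rename through the OpDesc builder API: inputs, outputs and attributes are staged in C++ maps and only serialized into the underlying proto::OpDesc when Proto()/Flush() is called. A short sketch of that flow (attribute and argument names are illustrative):

#include "paddle/framework/op_desc.h"

namespace f = paddle::framework;

void DescribeScaleOp() {
  f::OpDesc op;  // was OpDescBind
  op.SetType("scale");
  op.SetInput("X", {"x"});
  op.SetOutput("Out", {"out"});
  op.SetAttr("scale", 2.0f);

  // Attributes are held in a boost::variant; the proto enum value is
  // derived from the variant index, so this reports a FLOAT attribute.
  f::proto::AttrType t = op.GetAttrType("scale");
  (void)t;

  // Flush the staged inputs/outputs/attrs into the proto::OpDesc message.
  f::proto::OpDesc *pb = op.Proto();
  (void)pb;
}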
OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; @@ -43,7 +43,7 @@ struct OpInfo { return proto_ != nullptr && checker_ != nullptr; } - const OpProto& Proto() const { + const proto::OpProto& Proto() const { PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); PADDLE_ENFORCE(proto_->IsInitialized(), "Operator Proto must be initialized in op info"); diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h index 44e8ab16895cc604f85bb83e240eab55739f8ba0..efd3a5ca535403d8d46a73adc899d914623b53e4 100644 --- a/paddle/framework/op_proto_maker.h +++ b/paddle/framework/op_proto_maker.h @@ -22,6 +22,8 @@ namespace framework { // this class not only make proto but also init attribute checkers. class OpProtoAndCheckerMaker { public: + using OpProto = proto::OpProto; + using OpAttrChecker = framework::OpAttrChecker; OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : proto_(proto), op_checker_(op_checker) {} @@ -80,7 +82,7 @@ class OpProtoAndCheckerMaker { class NOPMaker : public OpProtoAndCheckerMaker { public: - NOPMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) {} }; diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc index 988a14cf4de8fdf052ca7e8c41bff0c05ba2daaa..f16cb6fa3aa095a6d9737d84c7ce58f385a7072b 100644 --- a/paddle/framework/op_proto_maker_test.cc +++ b/paddle/framework/op_proto_maker_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: - TestAttrProtoMaker(paddle::framework::OpProto* proto, + TestAttrProtoMaker(paddle::framework::proto::OpProto* proto, paddle::framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("scale", "scale of test op"); @@ -27,7 +27,7 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, DuplicatedAttr) { - paddle::framework::OpProto op_proto; + paddle::framework::proto::OpProto op_proto; paddle::framework::OpAttrChecker op_checker; auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); @@ -35,7 +35,7 @@ TEST(ProtoMaker, DuplicatedAttr) { class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { public: - TestInOutProtoMaker(paddle::framework::OpProto* proto, + TestInOutProtoMaker(paddle::framework::proto::OpProto* proto, paddle::framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("input", "input of test op"); @@ -44,7 +44,7 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { }; TEST(ProtoMaker, DuplicatedInOut) { - paddle::framework::OpProto op_proto; + paddle::framework::proto::OpProto op_proto; paddle::framework::OpAttrChecker op_checker; auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc index 8dedd873aad648174b770b84e5232cd17b577e72..dfa151316daeccfe92e26818165a694b78b5df62 100644 --- a/paddle/framework/op_registry.cc +++ b/paddle/framework/op_registry.cc @@ -31,7 +31,8 @@ std::unique_ptr OpRegistry::CreateOp( } static VariableNameMap ConvertOpDescVarsToVarNameMap( - const 
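The `using OpProto = proto::OpProto;` alias added to OpProtoAndCheckerMaker above lets existing maker classes keep the short spelling in their constructors even though the protobuf type moved into the proto namespace. A minimal sketch of a maker written against that alias (the op, its arguments, and the default value are placeholders, not part of this patch):

#include "paddle/framework/op_proto_maker.h"

namespace f = paddle::framework;

class MyScaleOpMaker : public f::OpProtoAndCheckerMaker {
 public:
  // OpProto and OpAttrChecker resolve through the aliases on the base class.
  MyScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input tensor of my_scale");
    AddOutput("Out", "output tensor of my_scale");
    AddAttr<float>("scale", "multiplier applied to X").SetDefault(1.0f);
    AddComment("Out = scale * X (illustrative example op).");
  }
};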
google::protobuf::RepeatedPtrField& op_desc_vars) { + const google::protobuf::RepeatedPtrField& + op_desc_vars) { VariableNameMap ret_val; for (auto& var : op_desc_vars) { auto& var_names = ret_val[var.parameter()]; @@ -43,9 +44,10 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap( return ret_val; } -std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { +std::unique_ptr OpRegistry::CreateOp( + const proto::OpDesc& op_desc) { VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" - "used in unit tests. Use CreateOp(const OpDescBind& op_desc) " + "used in unit tests. Use CreateOp(const OpDesc& op_desc) " "instead."; VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); @@ -57,7 +59,7 @@ std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { return CreateOp(op_desc.type(), inputs, outputs, attrs); } -std::unique_ptr OpRegistry::CreateOp(const OpDescBind& op_desc) { +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), op_desc.GetAttrMap()); } diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index b29238432b05d81e984e1f4c269a00b01a4229cc..7f0155b61f44b676825b84667d5bebb798cae8a3 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -77,9 +77,9 @@ class OpRegistry { const VariableNameMap& outputs, AttributeMap attrs); - static std::unique_ptr CreateOp(const OpDesc& op_desc); + static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); - static std::unique_ptr CreateOp(const OpDescBind& op_desc); + static std::unique_ptr CreateOp(const OpDesc& op_desc); }; template @@ -126,6 +126,14 @@ class OpKernelRegistrar : public Registrar { __test_global_namespace_##uniq_name##__>::value, \ msg) +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ #define REGISTER_OPERATOR(op_type, op_class, ...) \ STATIC_ASSERT_GLOBAL_NAMESPACE( \ __reg_op__##op_type, \ @@ -144,20 +152,29 @@ class OpKernelRegistrar : public Registrar { } /** - * Macro to register Operator. + * Macro to register Operator. When the input is duplicable, you should + * use REGISTER_OP_EX with deop_empty_grad=false instead. */ -#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ - grad_op_class) \ - REGISTER_OPERATOR(grad_op_type, grad_op_class); \ - class _GradOpDescMaker_##grad_op_type##_ \ - : public ::paddle::framework::DefaultGradOpDescMaker { \ - using ::paddle::framework::DefaultGradOpDescMaker< \ - true>::DefaultGradOpDescMaker; \ - \ - protected: \ - virtual std::string GradOpType() const { return #grad_op_type; } \ - }; \ - REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ +#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class) \ + REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, true) + +// When an argument is duplicable, we need to use this version. +// Perhaps we can omit DropEmptyIG template parameter and +// only have one version of REGISTER_OP. 
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type, \ + grad_op_class, drop_empty_grad) \ + REGISTER_OPERATOR(grad_op_type, grad_op_class); \ + class _GradOpDescMaker_##grad_op_type##_ \ + : public ::paddle::framework::DefaultGradOpDescMaker { \ + using ::paddle::framework::DefaultGradOpDescMaker< \ + drop_empty_grad>::DefaultGradOpDescMaker; \ + \ + protected: \ + virtual std::string GradOpType() const { return #grad_op_type; } \ + }; \ + REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \ op_maker_class); #define REGISTER_OP_WITH_KERNEL(op_type, ...) \ diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc index b860fe6cac773d1e85adecc43f5dfec42b6c7661..27713e5cbffe95e0ae31ac94a70c64deb53c4ffb 100644 --- a/paddle/framework/op_registry_test.cc +++ b/paddle/framework/op_registry_test.cc @@ -51,7 +51,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { static void BuildVar(const std::string& param_name, std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { + paddle::framework::proto::OpDesc::Var* var) { var->set_parameter(param_name); for (auto& arg_name : arguments) { var->add_arguments(arg_name); @@ -63,7 +63,7 @@ REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, paddle::framework::MyTestOpProtoAndCheckerMaker); TEST(OpRegistry, CreateOp) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("cos_sim"); BuildVar("input", {"aa"}, op_desc.add_inputs()); BuildVar("output", {"bb"}, op_desc.add_outputs()); @@ -71,7 +71,7 @@ TEST(OpRegistry, CreateOp) { float scale = 3.3; auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(scale); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); @@ -83,14 +83,14 @@ TEST(OpRegistry, CreateOp) { } TEST(OpRegistry, IllegalAttr) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("cos_sim"); BuildVar("input", {"aa"}, op_desc.add_inputs()); BuildVar("output", {"bb"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(-2.0); bool caught = false; @@ -108,7 +108,7 @@ TEST(OpRegistry, IllegalAttr) { } TEST(OpRegistry, DefaultValue) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("cos_sim"); BuildVar("input", {"aa"}, op_desc.add_inputs()); BuildVar("output", {"bb"}, op_desc.add_outputs()); @@ -123,7 +123,7 @@ TEST(OpRegistry, DefaultValue) { } TEST(OpRegistry, CustomChecker) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("my_test_op"); BuildVar("input", {"ii"}, op_desc.add_inputs()); BuildVar("output", {"oo"}, op_desc.add_outputs()); @@ -145,7 +145,7 @@ TEST(OpRegistry, CustomChecker) { // set 'test_attr' set to an illegal value auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("test_attr"); - attr->set_type(paddle::framework::AttrType::INT); + attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_i(3); caught = false; try { @@ -164,7 +164,7 @@ TEST(OpRegistry, CustomChecker) { op_desc.mutable_attrs()->Clear(); attr = op_desc.mutable_attrs()->Add(); attr->set_name("test_attr"); - 
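As the comments above note, REGISTER_OP now forwards to REGISTER_OP_EX with drop_empty_grad=true; ops whose input arguments are duplicable (lists of variables) should register through REGISTER_OP_EX directly with drop_empty_grad=false, which is exactly the condition the new PADDLE_ENFORCE_LE in GradOpDescMakerBase::InputGrad guards against. A hedged sketch of both forms (the op names and classes are placeholders, not operators touched by this patch):

// Simple op: one variable per input argument, so empty input gradients
// may safely be dropped.
REGISTER_OP(my_scale, ops::MyScaleOp, ops::MyScaleOpMaker, my_scale_grad,
            ops::MyScaleGradOp);

// Op with a duplicable input ("X" is a list of variables): keep empty
// gradients so each X[i] still lines up with its gradient slot.
REGISTER_OP_EX(my_concat, ops::MyConcatOp, ops::MyConcatOpMaker,
               my_concat_grad, ops::MyConcatGradOp,
               /*drop_empty_grad=*/false);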
attr->set_type(paddle::framework::AttrType::INT); + attr->set_type(paddle::framework::proto::AttrType::INT); attr->set_i(4); auto op = paddle::framework::OpRegistry::CreateOp(op_desc); paddle::platform::CPUDeviceContext dev_ctx; diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index e83d7547831744333d6a9c36e842d840a2a0dc03..0e58c0b5707516bd1274181df568d08ff504c152 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -377,7 +377,7 @@ class RuntimeInferShapeContext : public InferShapeContext { } } - VarDesc::VarType GetVarType(const std::string& name) const override { + proto::VarDesc::VarType GetVarType(const std::string& name) const override { auto* var = scope_.FindVar(name); return ToVarType(var->Type()); } @@ -417,7 +417,7 @@ OpKernelType OperatorWithKernel::GetKernelType( const ExecutionContext& ctx) const { return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); } -DataType OperatorWithKernel::IndicateDataType( +proto::DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { auto& scope = ctx.scope(); int data_type = -1; @@ -443,7 +443,7 @@ DataType OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input"); - return static_cast(data_type); + return static_cast(data_type); } } // namespace framework diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index e60dbfc313f732120f6879fd6fd19ca8abc06813..3207360cbaca4e3b96dfe933c67aaa70c59a6044 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -358,12 +358,13 @@ struct OpKernelType { }; platform::Place place_; - DataType data_type_; + proto::DataType data_type_; - OpKernelType(DataType data_type, platform::Place place) + OpKernelType(proto::DataType data_type, platform::Place place) : place_(place), data_type_(data_type) {} - OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx) + OpKernelType(proto::DataType data_type, + const platform::DeviceContext& dev_ctx) : place_(dev_ctx.GetPlace()), data_type_(data_type) {} bool operator==(const OpKernelType& o) const { @@ -409,7 +410,7 @@ class OperatorWithKernel : public OperatorBase { private: // indicate kernel DataType by input data. Defaultly all input data must be // same. 
- DataType IndicateDataType(const ExecutionContext& ctx) const; + proto::DataType IndicateDataType(const ExecutionContext& ctx) const; }; std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key); diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index b678178454ff63e4217f0be7a9938a9ba183cda4..05a465152204c8e9f9dbd75d0bfb21ea44d25cf1 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -58,7 +58,7 @@ class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { static void BuildVar(const std::string& param_name, std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { + paddle::framework::proto::OpDesc::Var* var) { var->set_parameter(param_name); for (auto& arg_name : arguments) { *var->mutable_arguments()->Add() = arg_name; @@ -70,14 +70,14 @@ REGISTER_OP_WITHOUT_GRADIENT( paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker); TEST(OperatorBase, all) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("test_operator"); BuildVar("input", {"IN1"}, op_desc.add_inputs()); BuildVar("output", {"OUT1"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); paddle::platform::CPUDeviceContext device_context; @@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetKernelType(const ExecutionContext& ctx) const override { - return OpKernelType(DataType::FP32, ctx.GetPlace()); + return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); } }; @@ -195,14 +195,14 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, // test with single input TEST(OpKernel, all) { - paddle::framework::OpDesc op_desc; + paddle::framework::proto::OpDesc op_desc; op_desc.set_type("op_with_kernel"); BuildVar("x", {"IN1"}, op_desc.add_inputs()); BuildVar("y", {"OUT1"}, op_desc.add_outputs()); auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); paddle::platform::CPUDeviceContext cpu_device_context; @@ -224,7 +224,7 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, TEST(OpKernel, multi_inputs) { using namespace paddle::framework; - OpDesc op_desc; + proto::OpDesc op_desc; op_desc.set_type("op_multi_inputs_with_kernel"); BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); BuildVar("k", {"k0"}, op_desc.add_inputs()); @@ -232,7 +232,7 @@ TEST(OpKernel, multi_inputs) { auto attr = op_desc.mutable_attrs()->Add(); attr->set_name("scale"); - attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); attr->set_f(3.14); paddle::platform::CPUDeviceContext cpu_device_context; diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc index 4af8d94563ad0ecf6fcc6fe0575b0f69006a9a2d..b5d9e5e385c1ba57169ef885824fc23b0f130692 100644 --- a/paddle/framework/program_desc.cc +++ b/paddle/framework/program_desc.cc @@ -18,49 +18,49 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) { +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { auto *b = desc_.add_blocks(); b->set_parent_idx(parent.ID()); b->set_idx(desc_.blocks_size() - 1); - blocks_.emplace_back(new BlockDescBind(this, b)); + blocks_.emplace_back(new BlockDesc(this, b)); return blocks_.back().get(); } -ProgramDesc *ProgramDescBind::Proto() { +proto::ProgramDesc *ProgramDesc::Proto() { for (auto &block : blocks_) { block->Flush(); } return &desc_; } -ProgramDescBind::ProgramDescBind() { +ProgramDesc::ProgramDesc() { auto *block = desc_.mutable_blocks()->Add(); block->set_idx(kRootBlockIndex); block->set_parent_idx(kNoneBlockIndex); - blocks_.emplace_back(new BlockDescBind(this, block)); + blocks_.emplace_back(new BlockDesc(this, block)); } -ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) { +ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); - blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this)); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); } } -ProgramDescBind::ProgramDescBind(const ProgramDesc &desc) { +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { desc_ = desc; for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDescBind(this, &block_desc)); + blocks_.emplace_back(new BlockDesc(this, &block_desc)); } } -ProgramDescBind::ProgramDescBind(const std::string &binary_str) { +ProgramDesc::ProgramDesc(const std::string &binary_str) { PADDLE_ENFORCE(desc_.ParseFromString(binary_str), "Fail to parse program_desc from binary string."); for (auto &block_desc : *desc_.mutable_blocks()) { - blocks_.emplace_back(new BlockDescBind(this, &block_desc)); + blocks_.emplace_back(new BlockDesc(this, &block_desc)); } } diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h index b1cb086de4345902482d8254b8aeec041ecf81bc..15a962bb696d6172acd1a83cf9bb1ffd0846d449 100644 --- a/paddle/framework/program_desc.h +++ b/paddle/framework/program_desc.h @@ -23,32 +23,32 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class BlockDescBind; +class BlockDesc; -class ProgramDescBind { +class ProgramDesc { public: - ProgramDescBind(); + ProgramDesc(); - explicit ProgramDescBind(const ProgramDesc &desc); + explicit ProgramDesc(const proto::ProgramDesc &desc); - ProgramDescBind(const ProgramDescBind &o); + ProgramDesc(const ProgramDesc &o); - explicit ProgramDescBind(const std::string &binary_str); + explicit ProgramDesc(const std::string &binary_str); - BlockDescBind *AppendBlock(const BlockDescBind &parent); + BlockDesc *AppendBlock(const BlockDesc &parent); - BlockDescBind *MutableBlock(size_t idx) { return blocks_[idx].get(); } + BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); } - const BlockDescBind &Block(size_t idx) const { return *blocks_[idx]; } + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } size_t Size() const { return blocks_.size(); } - ProgramDesc *Proto(); + proto::ProgramDesc *Proto(); private: - ProgramDesc desc_; + proto::ProgramDesc desc_; - std::vector> blocks_; + std::vector> blocks_; }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc index 83e7286e0ec3639fa589b0958922543a3ba16a00..a49886f7ea56bc57459202dba65e3f76a902cd70 100644 --- a/paddle/framework/program_desc_test.cc +++ b/paddle/framework/program_desc_test.cc @@ -19,18 +19,18 @@ namespace paddle { namespace framework { TEST(ProgramDesc, copy_ctor) { - ProgramDescBind program; + ProgramDesc program; auto* global_block = program.MutableBlock(0); auto* x = global_block->Var("X"); - x->SetType(VarDesc_VarType_LOD_TENSOR); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); x->SetLoDLevel(0); - x->SetDataType(FP32); + x->SetDataType(proto::FP32); x->SetShape({1000, 784}); auto* y = global_block->Var("Y"); - y->SetType(VarDesc_VarType_LOD_TENSOR); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); y->SetLoDLevel(0); - y->SetDataType(FP32); + y->SetDataType(proto::FP32); y->SetShape({784, 100}); auto* op = global_block->AppendOp(); @@ -39,15 +39,15 @@ TEST(ProgramDesc, copy_ctor) { op->SetInput("Y", {y->Name()}); auto* out = global_block->Var("Out"); - out->SetType(VarDesc_VarType_LOD_TENSOR); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); op->SetOutput("Y", {out->Name()}); - ProgramDescBind program_copy(program); + ProgramDesc program_copy(program); auto* global_block_copy = program_copy.MutableBlock(0); ASSERT_NE(global_block, global_block_copy); - auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { ASSERT_TRUE(global_block_copy->HasVar(name)); auto* copy = global_block_copy->Var(name); ASSERT_NE(copy, var_before); @@ -81,18 +81,18 @@ TEST(ProgramDesc, copy_ctor) { } TEST(ProgramDescBind, serialize_and_deserialize) { - ProgramDescBind program_origin; + ProgramDesc program_origin; auto* global_block = program_origin.MutableBlock(0); auto* x = global_block->Var("X"); - x->SetType(VarDesc_VarType_LOD_TENSOR); + x->SetType(proto::VarDesc_VarType_LOD_TENSOR); x->SetLoDLevel(0); - x->SetDataType(FP32); + x->SetDataType(proto::FP32); x->SetShape({1000, 784}); auto* y = global_block->Var("Y"); - y->SetType(VarDesc_VarType_LOD_TENSOR); + y->SetType(proto::VarDesc_VarType_LOD_TENSOR); y->SetLoDLevel(0); - y->SetDataType(FP32); + y->SetDataType(proto::FP32); y->SetShape({784, 100}); auto* op = global_block->AppendOp(); @@ -101,17 +101,17 @@ TEST(ProgramDescBind, 
serialize_and_deserialize) { op->SetInput("Y", {y->Name()}); auto* out = global_block->Var("Out"); - out->SetType(VarDesc_VarType_LOD_TENSOR); + out->SetType(proto::VarDesc_VarType_LOD_TENSOR); op->SetOutput("Y", {out->Name()}); std::string binary_str; program_origin.Proto()->SerializeToString(&binary_str); - ProgramDescBind program_restored(binary_str); + ProgramDesc program_restored(binary_str); auto* global_block_restored = program_restored.MutableBlock(0); ASSERT_NE(global_block, global_block_restored); - auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) { + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { ASSERT_TRUE(global_block_restored->HasVar(name)); auto* restored = global_block_restored->Var(name); ASSERT_NE(restored, var_before); diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc index da76052eb4d3067214841af72a35cebb26477e7f..25eb813ffb96e9b1e13299421ead9f85c02da59f 100644 --- a/paddle/framework/prune.cc +++ b/paddle/framework/prune.cc @@ -29,7 +29,7 @@ const std::string kFetchOpType = "fetch"; const std::string kDropOutOpType = "dropout"; const std::string kBatchNormOpType = "batch_norm"; -bool HasDependentVar(const OpDesc& op_desc, +bool HasDependentVar(const proto::OpDesc& op_desc, const std::set& dependent_vars) { for (auto& var : op_desc.outputs()) { for (auto& argu : var.arguments()) { @@ -41,14 +41,15 @@ bool HasDependentVar(const OpDesc& op_desc, return false; } -bool IsTarget(const OpDesc& op_desc) { +bool IsTarget(const proto::OpDesc& op_desc) { if (op_desc.has_is_target()) { return op_desc.is_target(); } return false; } -void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) { +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id) { // TODO(tonyyang-svail): // - will change to use multiple blocks for RNN op and Cond Op @@ -104,12 +105,12 @@ void prune_impl(const ProgramDesc& input, ProgramDesc* output, int block_id) { } // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies -void Prune(const ProgramDesc& input, ProgramDesc* output) { +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { prune_impl(input, output, 0); } -void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output, - int block_id) { +void inference_optimize_impl(const proto::ProgramDesc& input, + proto::ProgramDesc* output, int block_id) { *output = input; auto* op_field = output->mutable_blocks(block_id)->mutable_ops(); for (auto& op_desc : *op_field) { @@ -125,7 +126,8 @@ void inference_optimize_impl(const ProgramDesc& input, ProgramDesc* output, } } -void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output) { +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output) { inference_optimize_impl(input, output, 0); } diff --git a/paddle/framework/prune.h b/paddle/framework/prune.h index 23db014894348094a98e043aa744c6f0d27b2640..593292523d0c14136791bb804a4721a0740b47ba 100644 --- a/paddle/framework/prune.h +++ b/paddle/framework/prune.h @@ -20,9 +20,10 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -void Prune(const ProgramDesc& input, ProgramDesc* output); +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); -void InferenceOptimize(const ProgramDesc& input, ProgramDesc* output); +void InferenceOptimize(const proto::ProgramDesc& input, + proto::ProgramDesc* output); } // namespace framework } // namespace paddle diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc index f21df37a292fd1e039ee8f8fa26244e26c978cae..bdd57659432ea4f9bdd05425a802110b0c202fb8 100644 --- a/paddle/framework/prune_test.cc +++ b/paddle/framework/prune_test.cc @@ -29,12 +29,12 @@ namespace ops = paddle::operators; void AddOp(const std::string &type, const f::VariableNameMap &inputs, const f::VariableNameMap &outputs, f::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { + paddle::framework::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); + var->SetDataType(paddle::framework::proto::DataType::FP32); } } @@ -51,26 +51,26 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs, } TEST(Prune, one_operator) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, block); - f::ProgramDesc *pdesc = program.Proto(); - f::ProgramDesc pruned; + f::proto::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); - Prune(*pdesc, &pruned); + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); } TEST(Prune, forward) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, block); @@ -81,19 +81,19 @@ TEST(Prune, forward) { AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { - f::ProgramDesc pruned; + f::proto::ProgramDesc pruned; pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); - Prune(*pdesc, &pruned); + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); } } TEST(Prune, multi_input_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, block); @@ -104,17 +104,17 @@ TEST(Prune, multi_input_op) { AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, f::AttributeMap{}, block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); - f::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); } TEST(Prune, multi_output_op) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + 
f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, f::AttributeMap{}, block); @@ -123,17 +123,17 @@ TEST(Prune, multi_output_op) { AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); - f::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); } TEST(Prune, multi_target) { - f::ProgramDescBind program; - f::BlockDescBind *block = program.MutableBlock(0); + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, f::AttributeMap{}, block); @@ -142,11 +142,11 @@ TEST(Prune, multi_target) { AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, block); - f::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); - f::ProgramDesc pruned; - Prune(*pdesc, &pruned); + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); } diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index 7dac1cfd5ee0c320c67bc0b2448417d258d6862b..86dc01665bda5e7f988e60780c0600b049d737ef 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -57,17 +57,17 @@ void InferShapeContext::SetDims(const std::vector &names, SetDim(names[i], dims[i]); } } -std::vector InferShapeContext::GetInputsVarType( +std::vector InferShapeContext::GetInputsVarType( const std::string &name) const { return GetVarTypes(Inputs(name)); } -std::vector InferShapeContext::GetOutputsVarType( +std::vector InferShapeContext::GetOutputsVarType( const std::string &name) const { return GetVarTypes(Outputs(name)); } -std::vector InferShapeContext::GetVarTypes( +std::vector InferShapeContext::GetVarTypes( const std::vector &names) const { - std::vector retv; + std::vector retv; retv.resize(names.size()); std::transform(names.begin(), names.end(), retv.begin(), std::bind(std::mem_fn(&InferShapeContext::GetVarType), this, diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 46f2ea84b4b64292cc9026ef9864621efba79c7a..f93319d8f2fd4c5d388bd57fd595a6a5edd51775 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -27,8 +27,9 @@ class InferShapeContext { virtual bool HasInput(const std::string &name) const = 0; virtual bool HasOutput(const std::string &name) const = 0; - std::vector GetInputsVarType(const std::string &name) const; - std::vector GetOutputsVarType( + std::vector GetInputsVarType( + const std::string &name) const; + std::vector GetOutputsVarType( const std::string &name) const; virtual bool HasInputs(const std::string &name) const = 0; @@ -65,10 +66,10 @@ class InferShapeContext { std::vector GetDims( const std::vector &names) const; - std::vector GetVarTypes( + std::vector GetVarTypes( const std::vector &names) const; - virtual VarDesc::VarType GetVarType(const std::string &name) const = 0; + virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0; }; } // namespace framework diff --git 
a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h index baeb98c9bd49ec65da5931bcbe33ab788f86f3e8..da152e8b9d23490e2e69c2dd215c45355e1c1e44 100644 --- a/paddle/framework/type_defs.h +++ b/paddle/framework/type_defs.h @@ -25,11 +25,9 @@ namespace paddle { namespace framework { class OperatorBase; -class OpDescBind; -class BlockDescBind; -class BlockDesc; +class OpDesc; class InferShapeContext; -class BlockDescBind; +class BlockDesc; using VariableNameMap = std::map>; @@ -37,7 +35,7 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDescBind*>; + std::vector, BlockDesc*>; using AttributeMap = std::unordered_map; @@ -45,13 +43,13 @@ using OpCreator = std::function; -using GradOpMakerFN = std::function>( - const OpDescBind&, const std::unordered_set& /*no_grad_set*/, +using GradOpMakerFN = std::function>( + const OpDesc&, const std::unordered_set& /*no_grad_set*/, std::unordered_map* /*grad_to_var*/, - const std::vector& grad_block)>; + const std::vector& grad_block)>; -using InferVarTypeFN = std::function; +using InferVarTypeFN = + std::function; using InferShapeFN = std::function; diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc index 0babec29f6f4412ed29deeafe24470e86b30a636..bd8973eeb369aabd2c52d4fccf799657c564ee78 100644 --- a/paddle/framework/var_desc.cc +++ b/paddle/framework/var_desc.cc @@ -18,30 +18,32 @@ limitations under the License. */ namespace paddle { namespace framework { -VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); } +proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); } -void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); } +void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); } -void VarDescBind::SetShape(const std::vector &dims) { +void VarDesc::SetShape(const std::vector &dims) { VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); } -void VarDescBind::SetDataType(DataType data_type) { +void VarDesc::SetDataType(proto::DataType data_type) { mutable_tensor_desc()->set_data_type(data_type); } -std::vector VarDescBind::Shape() const { +std::vector VarDesc::Shape() const { return RepeatedToVector(tensor_desc().dims()); } -DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); } +proto::DataType VarDesc::GetDataType() const { + return tensor_desc().data_type(); +} -void VarDescBind::SetLoDLevel(int32_t lod_level) { +void VarDesc::SetLoDLevel(int32_t lod_level) { switch (desc_.type()) { - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: desc_.mutable_lod_tensor()->set_lod_level(lod_level); break; - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: desc_.mutable_tensor_array()->set_lod_level(lod_level); break; default: @@ -50,11 +52,11 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) { } } -int32_t VarDescBind::GetLodLevel() const { +int32_t VarDesc::GetLodLevel() const { switch (desc_.type()) { - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: return desc_.lod_tensor().lod_level(); - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.tensor_array().lod_level(); default: PADDLE_THROW("Tensor type=%d does not support LoDLevel", @@ -62,29 +64,29 @@ int32_t VarDescBind::GetLodLevel() const { } } -const TensorDesc &VarDescBind::tensor_desc() const { +const proto::TensorDesc &VarDesc::tensor_desc() const { PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set 
type"); switch (desc_.type()) { - case VarDesc::SELECTED_ROWS: + case proto::VarDesc::SELECTED_ROWS: return desc_.selected_rows(); - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: return desc_.lod_tensor().tensor(); - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.tensor_array().tensor(); default: PADDLE_THROW("Unexpected branch."); } } -TensorDesc *VarDescBind::mutable_tensor_desc() { +proto::TensorDesc *VarDesc::mutable_tensor_desc() { PADDLE_ENFORCE(desc_.has_type(), "invoke MutableTensorDesc must after set type"); switch (desc_.type()) { - case VarDesc::SELECTED_ROWS: + case proto::VarDesc::SELECTED_ROWS: return desc_.mutable_selected_rows(); - case VarDesc::LOD_TENSOR: + case proto::VarDesc::LOD_TENSOR: return desc_.mutable_lod_tensor()->mutable_tensor(); - case VarDesc::LOD_TENSOR_ARRAY: + case proto::VarDesc::LOD_TENSOR_ARRAY: return desc_.mutable_tensor_array()->mutable_tensor(); default: PADDLE_THROW("Unexpected branch."); diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index 5cf4608944c5011d798fbde060002a57be8f6102..4fd2abe7fb215c3ac454de3e30754685111eb570 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -53,44 +53,44 @@ inline void VectorToRepeated(const std::vector &vec, } } -class VarDescBind { +class VarDesc { public: - explicit VarDescBind(const std::string &name) { + explicit VarDesc(const std::string &name) { desc_.set_name(name); - desc_.set_type(VarDesc::LOD_TENSOR); + desc_.set_type(proto::VarDesc::LOD_TENSOR); } - explicit VarDescBind(const VarDesc &desc) : desc_(desc) {} + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} - VarDesc *Proto() { return &desc_; } + proto::VarDesc *Proto() { return &desc_; } std::string Name() const { return desc_.name(); } void SetShape(const std::vector &dims); - void SetDataType(DataType data_type); + void SetDataType(proto::DataType data_type); std::vector Shape() const; - DataType GetDataType() const; + proto::DataType GetDataType() const; void SetLoDLevel(int32_t lod_level); int32_t GetLodLevel() const; - VarDesc::VarType GetType() const; + proto::VarDesc::VarType GetType() const; - void SetType(VarDesc::VarType type); + void SetType(proto::VarDesc::VarType type); bool Persistable() const { return desc_.persistable(); } void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } private: - const TensorDesc &tensor_desc() const; - TensorDesc *mutable_tensor_desc(); + const proto::TensorDesc &tensor_desc() const; + proto::TensorDesc *mutable_tensor_desc(); - VarDesc desc_; + proto::VarDesc desc_; }; } // namespace framework } // namespace paddle diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h index 0f19870bec3e69d07278507cc556a86bbd25d12d..43a72276408bdefc329e8ddcd901ba346aba35f3 100644 --- a/paddle/framework/var_type.h +++ b/paddle/framework/var_type.h @@ -20,15 +20,15 @@ namespace paddle { namespace framework { -inline VarDesc::VarType ToVarType(std::type_index type) { +inline proto::VarDesc::VarType ToVarType(std::type_index type) { if (type.hash_code() == typeid(LoDTensor).hash_code()) { - return VarDesc_VarType_LOD_TENSOR; + return proto::VarDesc_VarType_LOD_TENSOR; } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) { - return VarDesc_VarType_LOD_RANK_TABLE; + return proto::VarDesc_VarType_LOD_RANK_TABLE; } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) { - return VarDesc_VarType_LOD_TENSOR_ARRAY; + return 
proto::VarDesc_VarType_LOD_TENSOR_ARRAY; } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { - return VarDesc_VarType_SELECTED_ROWS; + return proto::VarDesc_VarType_SELECTED_ROWS; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); } @@ -37,16 +37,16 @@ inline VarDesc::VarType ToVarType(std::type_index type) { template inline void VisitVarType(const Variable& var, Visitor visitor) { switch (ToVarType(var.Type())) { - case VarDesc_VarType_LOD_TENSOR: + case proto::VarDesc_VarType_LOD_TENSOR: visitor(var.Get()); return; - case VarDesc_VarType_LOD_RANK_TABLE: + case proto::VarDesc_VarType_LOD_RANK_TABLE: visitor(var.Get()); return; - case VarDesc_VarType_LOD_TENSOR_ARRAY: + case proto::VarDesc_VarType_LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case VarDesc_VarType_SELECTED_ROWS: + case proto::VarDesc_VarType_SELECTED_ROWS: visitor(var.Get()); return; default: diff --git a/paddle/framework/var_type_inference.h b/paddle/framework/var_type_inference.h index 32abbeb33479444c5e7a9889f4211f59af07f98f..1a4dca05f741f33d58eeccda9d1f800aadb8c01f 100644 --- a/paddle/framework/var_type_inference.h +++ b/paddle/framework/var_type_inference.h @@ -21,8 +21,7 @@ namespace framework { class VarTypeInference { public: virtual ~VarTypeInference() {} - virtual void operator()(const OpDescBind& op_desc, - BlockDescBind* block) const = 0; + virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; }; } // namespace framework diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/framework/var_type_inference_test.cc index 9035e63fa48ffdf7c72061b0a4248538d7a357e4..92f333c558413ac3253c0fb8a20d6f0cfa33f99c 100644 --- a/paddle/framework/var_type_inference_test.cc +++ b/paddle/framework/var_type_inference_test.cc @@ -33,17 +33,16 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(const OpDescBind &op_desc, - BlockDescBind *block) const override { + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { auto &inputs = op_desc.Input("X"); - auto default_var_type = VarDesc::SELECTED_ROWS; + auto default_var_type = proto::VarDesc::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string &name) { - return block->Var(name)->GetType() == VarDesc::LOD_TENSOR; + return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR; }); if (any_input_is_lod_tensor) { - default_var_type = VarDesc::LOD_TENSOR; + default_var_type = proto::VarDesc::LOD_TENSOR; } auto out_var_name = op_desc.Output("Out").front(); @@ -62,43 +61,43 @@ namespace paddle { namespace framework { TEST(InferVarType, sum_op) { - ProgramDescBind prog; + ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); op->SetType("sum"); op->SetInput("X", {"test_a", "test_b", "test_c"}); op->SetOutput("Out", {"test_out"}); - prog.MutableBlock(0)->Var("test_a")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test_c")->SetType(VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS); prog.MutableBlock(0)->Var("test_out"); op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(VarDesc::SELECTED_ROWS, + ASSERT_EQ(proto::VarDesc::SELECTED_ROWS, 
prog.MutableBlock(0)->Var("test_out")->GetType()); - prog.MutableBlock(0)->Var("test_b")->SetType(VarDesc::LOD_TENSOR); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR); op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(VarDesc::LOD_TENSOR, + ASSERT_EQ(proto::VarDesc::LOD_TENSOR, prog.MutableBlock(0)->Var("test_out")->GetType()); } TEST(InferVarType, sum_op_without_infer_var_type) { - ProgramDescBind prog; + ProgramDesc prog; auto *op = prog.MutableBlock(0)->AppendOp(); op->SetType("sum_without_infer_var_type"); op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); op->SetOutput("Out", {"test2_out"}); - prog.MutableBlock(0)->Var("test2_a")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test2_b")->SetType(VarDesc::SELECTED_ROWS); - prog.MutableBlock(0)->Var("test2_c")->SetType(VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS); prog.MutableBlock(0)->Var("test2_out"); op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(VarDesc_VarType_LOD_TENSOR, + ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 1df88a6da9fb0c50d0d7ecd083c0533d8a886a67..5c629dc3d2aca2705e439df836214c1284b31c8f 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -62,33 +62,6 @@ void Copy(platform::GPUPlace dst_place, } } -template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(src_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); -} - -template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num) { - platform::SetDeviceId(dst_place.device); - platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); -} - #endif } // namespace memory diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 76da21c4726a1245241c1cf61860f9c8b62ea452..b8ed93f4eb549fbd76bf360d4b843c1fa9635b40 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -63,8 +63,7 @@ class AccuracyOp : public framework::OperatorWithKernel { class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { public: - AccuracyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { // TODO(typhoonzero): support both inference value and indices. 
AddInput("Out", "The network output of topk (inferences)"); diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index 539a93530206c93a37791a9ccb2fb104af17f940..dd51aad105fecf4e3118f03e2f1868abb5523bc8 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -26,7 +26,7 @@ template __global__ void AccuracyCudaKernel(const int N, const int D, const int64_t* Xdata, const int64_t* labeldata, int* correct_data, - float* accuracy) { + float* accuracy, int* total_data) { int count = 0; __shared__ int total[BlockSize]; @@ -47,6 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, if (threadIdx.x == 0) { *correct_data = result; *accuracy = static_cast(result) / static_cast(N); + *total_data = N; } } @@ -80,22 +81,11 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { if (num_samples == 0) { return; } - platform::GpuMemcpyAsync(total_data, &num_samples, sizeof(int), - cudaMemcpyHostToDevice, stream); AccuracyCudaKernel< PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data); - - int d_num_samples, d_num_correct; - float d_accuracy; - platform::GpuMemcpyAsync(&d_num_correct, correct_data, sizeof(int), - cudaMemcpyDeviceToHost, stream); - platform::GpuMemcpyAsync(&d_num_samples, total_data, sizeof(int), - cudaMemcpyDeviceToHost, stream); - platform::GpuMemcpyAsync(&d_accuracy, accuracy_data, sizeof(float), - cudaMemcpyDeviceToHost, stream); + accuracy_data, total_data); } }; diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 63490f0ec9f4852a3ead574b9d52c807d8ba6d89..2b4c7e5f0de8347d4789136a3a45408ada439f02 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -38,9 +38,8 @@ class ActivationOpGrad : public framework::OperatorWithKernel { class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: - SigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sigmoid operator"); AddOutput("Y", "Output of Sigmoid operator"); AddComment(R"DOC( @@ -54,9 +53,8 @@ $$y = \frac{1}{1 + e^{-x}}$$ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: - LogSigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of LogSigmoid operator"); AddOutput("Y", "Output of LogSigmoid operator"); AddComment(R"DOC( @@ -70,8 +68,8 @@ $$y = \log \frac{1}{1 + e^{-x}}$$ class ExpOpMaker : public framework::OpProtoAndCheckerMaker { public: - ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Exp operator"); AddOutput("Y", "Output of Exp operator"); AddComment(R"DOC( @@ -85,8 +83,8 @@ $y = e^x$ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ReluOpMaker(OpProto *proto, 
OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu operator"); AddOutput("Y", "Output of Relu operator"); AddComment(R"DOC( @@ -100,9 +98,8 @@ $y = \max(x, 0)$ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - LeakyReluOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of LeakyRelu operator"); AddOutput("Y", "Output of LeakyRelu operator"); AddAttr("alpha", "The small negative slope").SetDefault(0.02f); @@ -117,9 +114,8 @@ $y = \max(x, \alpha * x)$ class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftShrinkOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Softshrink operator"); AddOutput("Y", "Output of Softshrink operator"); AddAttr("lambda", "non-negative offset").SetDefault(0.5f); @@ -140,8 +136,8 @@ $$ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { public: - TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Tanh operator"); AddOutput("Y", "Output of Tanh operator"); AddComment(R"DOC( @@ -155,9 +151,8 @@ $$y = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - TanhShrinkOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of TanhShrink operator"); AddOutput("Y", "Output of TanhShrink operator"); AddComment(R"DOC( @@ -171,9 +166,8 @@ $$y = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { public: - HardShrinkOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of HardShrink operator"); AddOutput("Y", "Output of HardShrink operator"); AddAttr("threshold", "The value of threshold for HardShrink") @@ -195,8 +189,8 @@ $$ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { public: - SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sqrt operator"); AddOutput("Y", "Output of Sqrt operator"); AddComment(R"DOC( @@ -210,8 +204,8 @@ $y = \sqrt{x}$ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { public: - AbsOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { 
AddInput("X", "Input of Abs operator"); AddOutput("Y", "Output of Abs operator"); AddComment(R"DOC( @@ -225,8 +219,8 @@ $y = |x|$ class CeilOpMaker : public framework::OpProtoAndCheckerMaker { public: - CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Ceil operator"); AddOutput("Y", "Output of Ceil operator"); AddComment(R"DOC( @@ -240,8 +234,8 @@ $y = ceil(x)$ class FloorOpMaker : public framework::OpProtoAndCheckerMaker { public: - FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Floor operator"); AddOutput("Y", "Output of Floor operator"); AddComment(R"DOC( @@ -255,8 +249,8 @@ $y = floor(x)$ class RoundOpMaker : public framework::OpProtoAndCheckerMaker { public: - RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Round operator"); AddOutput("Y", "Output of Round operator"); AddComment(R"DOC( @@ -270,9 +264,8 @@ $y = [x]$ class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReciprocalOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Reciprocal operator"); AddOutput("Y", "Output of Reciprocal operator"); AddComment(R"DOC( @@ -286,8 +279,8 @@ $$y = \frac{1}{x}$$ class LogOpMaker : public framework::OpProtoAndCheckerMaker { public: - LogOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + LogOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Log operator"); AddOutput("Y", "Output of Log operator"); AddComment(R"DOC( @@ -303,8 +296,8 @@ Natural logarithm of x. 
class SquareOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquareOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Square operator"); AddOutput("Y", "Output of Square operator"); AddComment(R"DOC( @@ -318,9 +311,8 @@ $y = x^2$ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftplusOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Softplus operator"); AddOutput("Y", "Output of Softplus operator"); AddComment(R"DOC( @@ -334,9 +326,8 @@ $y = \ln(1 + e^{x})$ class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftsignOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Softsign operator"); AddOutput("Y", "Output of Softsign operator"); AddComment(R"DOC( @@ -350,8 +341,8 @@ $$y = \frac{x}{1 + |x|}$$ class BReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - BReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of BRelu operator"); AddOutput("Y", "Output of BRelu operator"); AddAttr("t_min", "The min marginal value of BRelu") @@ -369,9 +360,8 @@ $y = \max(\min(x, t_{min}), t_{max})$ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftReluOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of SoftRelu operator"); AddOutput("Y", "Output of SoftRelu operator"); AddAttr("threshold", "The threshold value of SoftRelu") @@ -387,8 +377,8 @@ $y = \ln(1 + \exp(\max(\min(x, threshold), threshold))$ class ELUOpMaker : public framework::OpProtoAndCheckerMaker { public: - ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of ELU operator"); AddOutput("Y", "Output of ELU operator"); AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); @@ -406,8 +396,8 @@ $y = \max(0, x) + \min(0, \alpha * (e^x - 1))$ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { public: - Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu6 operator"); AddOutput("Y", "Output of Relu6 operator"); AddAttr("threshold", "The threshold value of Relu6") @@ -423,8 +413,8 @@ $y = \min(\max(0, x), 6)$ class PowOpMaker : public 
framework::OpProtoAndCheckerMaker { public: - PowOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + PowOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Pow operator"); AddOutput("Y", "Output of Pow operator"); AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); @@ -439,8 +429,8 @@ $y = x^{factor}$ class STanhOpMaker : public framework::OpProtoAndCheckerMaker { public: - STanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of STanh operator"); AddOutput("Y", "Output of STanh operator"); AddAttr("scale_a", "The scale parameter of a for the input") @@ -458,9 +448,8 @@ $$y = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - ThresholdedReluOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of ThresholdedRelu operator"); AddOutput("Y", "Output of ThresholdedRelu operator"); AddAttr("threshold", "The threshold location of activation") @@ -481,9 +470,8 @@ $$ class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: - HardSigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of HardSigmoid operator"); AddOutput("Y", "Output of HardSigmoid operator"); AddAttr("slope", "Slope for linear approximation of sigmoid") @@ -508,8 +496,8 @@ It is recommended to use the defaults for this activation. 
class SwishOpMaker : public framework::OpProtoAndCheckerMaker { public: - SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Swish operator"); AddOutput("Y", "Output of Swish operator"); AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 507811e7b59b9426c599570ead9b42f8d02380fd..d8a9491c8247ac463e01606dac248780d5284236 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -59,8 +59,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdadeltaOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index 5d007163161cd4bf4a9fd46eda57f7984c6a414f..052c793a01907abdc7784d1290f43543ae81bdb1 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -59,8 +59,7 @@ class AdagradOp : public framework::OperatorWithKernel { class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdagradOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index cf6ef6dd53979b23de125014b8d5150d8ce4c053..03527de936bf736d572fb0140033bde4db990981 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -73,7 +73,7 @@ class AdamOp : public framework::OperatorWithKernel { class AdamOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 49ce497bb710de24b198fb4b5f56ff6d277c6f52..3b0b71418477ea128dbb31a8d7cd44cf6bf023a1 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -67,7 +67,7 @@ class AdamaxOp : public framework::OperatorWithKernel { class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index faeba7f3ed26d05de16775a1de4d42f802111207..b6ca3cad94425207629160a4c7d715f685b23a09 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -114,8 +114,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { class 
ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(std::vector) A vector of tensors that is going to " @@ -150,14 +149,14 @@ class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("lod_tensor_to_array"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 0a37f18729a93b15623c0a17e3689e518c38b844..a914ff4ba92318c75326bd7945bb73bcb93b6fc3 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -86,8 +86,7 @@ class AssignOp : public framework::OperatorBase { class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - AssignOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " @@ -109,8 +108,8 @@ class AssignInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *context) const override { if (context->HasInput("X")) { auto type = context->GetInputsVarType("X")[0]; - if (type == framework::VarDesc_VarType_SELECTED_ROWS || - type == framework::VarDesc_VarType_LOD_TENSOR) { + if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS || + type == framework::proto::VarDesc_VarType_LOD_TENSOR) { context->SetOutputDim("Out", context->GetInputDim("X")); } } @@ -122,12 +121,12 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); op->SetType("assign"); op->SetInput("X", OutputGrad("Out")); op->SetOutput("Out", InputGrad("X")); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc index 6c3f67ec32fb1b942241997e87a1e9c4752e707d..811c487089fcf4044f129ad6bf95b46535d4fcd6 100644 --- a/paddle/operators/auc_op.cc +++ b/paddle/operators/auc_op.cc @@ -49,7 +49,7 @@ class AucOp : public framework::OperatorWithKernel { class AucOpMaker : public framework::OpProtoAndCheckerMaker { public: - AucOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + AucOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Out", "A floating point 2D tensor, values are in the range [0, 1]." 
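Note: the operator files above all receive the same mechanical rewrite — maker constructors now take the shorter OpProto* / OpAttrChecker* parameter types, and gradient-desc makers build a framework::OpDesc, the class formerly named OpDescBind. The following is a minimal sketch of that pattern, not part of this patch: the operator name my_op, its variables, and the grad-op wiring are hypothetical, and it assumes the same using-declarations for OpProto and OpAttrChecker that the operator files above rely on.

    #include "paddle/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    // Hypothetical maker: only the parameter types differ from the pre-rename code.
    class MyOpMaker : public framework::OpProtoAndCheckerMaker {
     public:
      MyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
          : OpProtoAndCheckerMaker(proto, op_checker) {
        AddInput("X", "(Tensor) Input of my_op");
        AddOutput("Out", "(Tensor) Output of my_op");
        AddComment("Hypothetical operator, used only to show the new maker signature.");
      }
    };

    // Hypothetical grad maker: Apply() now returns framework::OpDesc
    // (formerly framework::OpDescBind).
    class MyOpGradMaker : public framework::SingleGradOpDescMaker {
     public:
      using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

     protected:
      std::unique_ptr<framework::OpDesc> Apply() const override {
        auto *grad_op = new framework::OpDesc();
        grad_op->SetType("my_op_grad");
        grad_op->SetInput("X", Input("X"));
        grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
        grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
        grad_op->SetAttrMap(Attrs());
        return std::unique_ptr<framework::OpDesc>(grad_op);
      }
    };

    }  // namespace operators
    }  // namespace paddle

Apart from the dropped Bind suffix and the proto:: qualification on the generated protobuf types, the operator logic itself is unchanged by the renaming.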
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index 94a972b7ab56f41f8b6a203b6bf0330a69f84e54..1c14acbe11fbad9654bd0309f5674176ebdb5e6f 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/batch_norm_op.h" +#include "paddle/framework/data_layout.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; template using EigenArrayMap = @@ -60,15 +62,15 @@ class BatchNormOp : public framework::OperatorWithKernel { "Variance and VarianceOut should share the same memory"); const auto x_dims = ctx->GetInputDim("X"); - const TensorFormat tensor_format = - StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "Input X must have 2 to 5 dimensions."); const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); @@ -85,13 +87,12 @@ class BatchNormOp : public framework::OperatorWithKernel { class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { public: - BatchNormOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("is_test", "").SetDefault(false); AddAttr("momentum", "").SetDefault(0.9); AddAttr("epsilon", "").SetDefault(1e-5); - AddAttr("tensor_format", "").SetDefault("NCHW"); + AddAttr("data_layout", "").SetDefault("NCHW"); AddInput("X", "The input tensor"); AddInput("Scale", "Scale is a 1-dimensional tensor of size C " @@ -142,9 +143,9 @@ class BatchNormKernel const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); @@ -152,8 +153,8 @@ class BatchNormKernel "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; auto *y = ctx.Output("Y"); @@ -178,8 +179,8 @@ class BatchNormKernel saved_mean_e.setZero(); saved_variance_e.setZero(); - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); for (int nc = 0; nc < N * C; ++nc) { saved_mean_e(nc % C) += x_arr.col(nc).sum(); @@ -192,7 +193,7 @@ class BatchNormKernel saved_variance_e /= N * sample_size; break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); for (int i = 0; i < N * sample_size; ++i) { saved_mean_e += x_arr.col(i); @@ -206,7 +207,7 @@ class BatchNormKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", tensor_format_str); + PADDLE_THROW("Unknown storage order: %s", data_layout_str); } EigenVectorArrayMap running_mean_arr( @@ -248,8 +249,8 @@ class BatchNormKernel Eigen::Array new_bias = bias_arr - mean_arr * inv_std * scale_arr; - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, N * C); ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -258,7 +259,7 @@ class BatchNormKernel } break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, N * sample_size) = (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * @@ -268,7 +269,7 @@ class BatchNormKernel break; } default: - PADDLE_THROW("Unknown storage order: %d", tensor_format); + PADDLE_THROW("Unknown storage order: %d", data_layout); } } }; @@ -291,11 +292,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); const auto x_dims = ctx->GetInputDim("X"); - const TensorFormat tensor_format = - StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); @@ -334,9 +335,9 @@ class BatchNormGradKernel const auto *saved_mean = ctx.Input("SavedMean"); // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] @@ -345,8 +346,8 @@ class BatchNormGradKernel "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = - (tensor_format == TensorFormat::NCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; ConstEigenVectorArrayMap scale_arr(scale->data(), C); @@ -377,8 +378,8 @@ class BatchNormGradKernel const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); - switch (tensor_format) { - case TensorFormat::NCHW: { + switch (data_layout) { + case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), @@ -401,7 +402,7 @@ class BatchNormGradKernel } break; } - case TensorFormat::NHWC: { + case DataLayout::kNHWC: { ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, @@ -426,7 +427,7 @@ class BatchNormGradKernel break; } default: - PADDLE_THROW("Unknown storage order: %s", tensor_format_str); + PADDLE_THROW("Unknown storage order: %s", data_layout_str); } } }; diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index c7adc3d80ed25d129cec41a0fd3d22fd42aba363..55d0736a4c8e09eea637e5ab7e49af9a618e7fd8 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/batch_norm_op.h" +#include "paddle/framework/data_layout.h" #include #include "paddle/operators/math/math_function.h" @@ -22,12 +23,12 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; template using CudnnDataType = platform::CudnnDataType; -void ExtractNCWHD(const framework::DDim &dims, - const TensorFormat &tensor_format, int *N, int *C, int *H, - int *W, int *D) { +void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout, + int *N, int *C, int *H, int *W, int *D) { *N = dims[0]; if (dims.size() == 2) { *C = dims[1]; @@ -35,13 +36,13 @@ void ExtractNCWHD(const framework::DDim &dims, *W = 1; *D = 1; } else { - *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; - *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; *W = dims.size() > 3 - ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) : 1; *D = dims.size() > 4 - ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) : 1; } } @@ -56,9 +57,9 @@ class BatchNormKernel double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); // Get the size for each dimension. 
// NCHW [batch_size, in_channels, in_height, in_width] @@ -67,7 +68,7 @@ class BatchNormKernel PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "The Input dim size should be between 2 and 5"); int N, C, H, W, D; - ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // ------------------- cudnn descriptors --------------------- cudnnTensorDescriptor_t data_desc_; @@ -93,7 +94,7 @@ class BatchNormKernel VLOG(1) << "Setting descriptors."; std::vector dims; std::vector strides; - if (tensor_format == TensorFormat::NCHW) { + if (data_layout == DataLayout::kNCHW) { dims = {N, C, H, W, D}; strides = {C * H * W * D, H * W * D, W * D, D, 1}; } else { @@ -180,9 +181,9 @@ class BatchNormGradKernel PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use GPUPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string tensor_format_str = - ctx.Attr("tensor_format"); - const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto *d_y = ctx.Input(framework::GradVarName("Y")); const auto *scale = ctx.Input("Scale"); @@ -192,7 +193,7 @@ class BatchNormGradKernel PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, "The Input dim size should be between 2 and 5"); int N, C, H, W, D; - ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -219,7 +220,7 @@ class BatchNormGradKernel std::vector dims; std::vector strides; - if (tensor_format == TensorFormat::NCHW) { + if (data_layout == DataLayout::kNCHW) { dims = {N, C, H, W, D}; strides = {C * H * W * D, H * W * D, W * D, D, 1}; } else { diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 8d99b6864776e81b30e87c09028b336309cf2838..a817ef41fc87da33ad87923c99a75ee7c3c7bbfe 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -19,21 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum TensorFormat { - NHWC = 0, - NCHW = 1, -}; - -inline TensorFormat StringToTensorFormat(const std::string& str) { - if (str == "NHWC" || str == "nhwc") { - return TensorFormat::NHWC; - } else if (str == "NCHW" || str == "nchw") { - return TensorFormat::NCHW; - } else { - PADDLE_THROW("Unknown storage order string: %s", str); - } -} - template class BatchNormKernel : public framework::OpKernel { public: diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc index c796a0c5d089499e7858c7a427825fdbeb05cb7f..32756faac5324cfb3b5366857d2c8176665fb3ec 100644 --- a/paddle/operators/beam_search_decode_op.cc +++ b/paddle/operators/beam_search_decode_op.cc @@ -83,9 +83,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - BeamSearchDecodeOpProtoMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { + BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Ids", "(LodTensorArray)" "score of the candidate words in each step"); @@ -120,13 +119,13 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { for (auto& o : op_desc.Output("SentenceIds")) { - block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR); + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); } for (auto& o : op_desc.Output("SentenceScores")) { - block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR); + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); } } }; diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc index 8c3e2a303fb8f12a8886c11cf112b859a6db7bcf..69ddc52035ae78dd2d1926b66fcbbe36737e87aa 100644 --- a/paddle/operators/beam_search_op.cc +++ b/paddle/operators/beam_search_op.cc @@ -153,8 +153,7 @@ bool BeamSearch::NextItemSet(std::vector *items) { class BeamSearchProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { public: - BeamSearchProtoAndCheckerMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { // inputs and outputs stored in proto AddInput("pre_ids", "ids in previous step"); diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc index 217fd523667777f7d250295d2a036867dac94f04..7640147a12d66a924f16eaf168227b6ce6a96040 100644 --- a/paddle/operators/bilinear_tensor_product_op.cc +++ b/paddle/operators/bilinear_tensor_product_op.cc @@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel { class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { public: - BilinearTensorProductOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of bilinear_tensor_product operator."); AddInput("Y", "The second input of 
bilinear_tensor_product operator."); diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index d641b8fc9fea81d1e364ae05de98ed7760a32648..fc6da064904610f5c9c140a6328858d697dd954e 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -20,8 +20,7 @@ namespace operators { class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CastOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of cast op"); AddOutput("Out", "The output tensor of cast op"); @@ -53,14 +52,14 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); grad->SetType("cast"); grad->SetInput("X", OutputGrad("Out")); grad->SetOutput("Out", InputGrad("X")); grad->SetAttr("out_dtype", GetAttr("in_dtype")); grad->SetAttr("in_dtype", GetAttr("out_dtype")); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h index a6773f13a8deb443b022c6045f1b3b976b3e6607..0c72d809e67e8f3f25be5643041d89da3d04d95e 100644 --- a/paddle/operators/cast_op.h +++ b/paddle/operators/cast_op.h @@ -55,7 +55,7 @@ class CastOpKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); framework::VisitDataType( - static_cast(context.Attr("out_dtype")), + static_cast(context.Attr("out_dtype")), CastOpFunctor( in, out, context.template device_context())); } diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index 894f355deb9d764ef72d452f362e6b42f8831667..f1f274a7af079d68c7c1bcd8ec07962e18b0ea60 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -57,15 +57,14 @@ class ChunkEvalOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(framework::DataType::FP32, + return framework::OpKernelType(framework::proto::DataType::FP32, ctx.device_context()); } }; class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { public: - ChunkEvalOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Inference", "(Tensor, default: Tensor). " diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index 0b7975a63f7d364bf9b0ce529e2dd72d9f3cd2e9..05c79d0e25deea84463f0b67ac4dc9a8dd43f2cb 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel { class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { public: - ClipByNormOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of clip_by_norm op." 
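Stepping back to the batch_norm changes above: the per-op TensorFormat enum is removed and layout handling goes through framework::DataLayout, with the channel count read from index 1 for NCHW and from the last index for NHWC. The sketch below restates just that lookup in isolation; GetChannelDim is an illustrative helper name, and only StringToDataLayout, DataLayout and DDim come from the framework.

#include <cstdint>
#include <string>

#include "paddle/framework/data_layout.h"
#include "paddle/framework/ddim.h"

namespace paddle {
namespace operators {

// Layout-dependent channel lookup, as used by batch_norm after this patch.
inline int64_t GetChannelDim(const framework::DDim &dims,
                             const std::string &data_layout_str) {
  const framework::DataLayout layout =
      framework::StringToDataLayout(data_layout_str);
  // NCHW keeps channels at index 1; NHWC keeps them in the last dimension.
  return layout == framework::DataLayout::kNCHW ? dims[1]
                                                : dims[dims.size() - 1];
}

}  // namespace operators
}  // namespace paddle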
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 6092212de4635e2ada81f8383a0ccf64a8116158..e34ba0a8f4757e45db58270dfd6191157f6e226a 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -38,7 +38,7 @@ class ClipOp : public framework::OperatorWithKernel { template class ClipOpMaker : public framework::OpProtoAndCheckerMaker { public: - ClipOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor)The input of clip op." diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index bf7e88368157d29e627c3c06384f28b6e5e4ecc1..1148172f3a2cc9b3f849ee04cefc19f16742d3eb 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -20,8 +20,7 @@ namespace operators { template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - CompareOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { OpComment comment; AddInput("X", diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index cf522d6921ee746d03d8082b8fc4d051f4d504e6..32b61edfd0dd163e5ef8f3d1de133c55314458b5 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -58,7 +58,7 @@ class ConcatOp : public framework::OperatorWithKernel { class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { public: - ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input tensors of concat operator.").AsDuplicable(); AddOutput("Out", "Output tensor of concat operator."); @@ -98,8 +98,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, - ops::ConcatOpGrad) +REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, + ops::ConcatOpGrad, false) REGISTER_OP_CPU_KERNEL(concat, ops::ConcatKernel) REGISTER_OP_CPU_KERNEL(concat_grad, diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc index b809bdc3a0fea727f2fb6ea0a55672ee9b0bbd04..8c860676e06de5dac9570d2a6f7271ff451eebee 100644 --- a/paddle/operators/cond_op.cc +++ b/paddle/operators/cond_op.cc @@ -205,8 +205,7 @@ void CondOp::Run(const Scope& scope, class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { public: - CondOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Cond", "The condition, which is a bool vector"); AddInput("Xs", "Inputs of Subnets").AsDuplicable(); diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index 6f2ef9174e84a0c0ae096956c04039435e6583c6..204be7d1e5385b7fdab54914bec216543e360cd3 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -65,7 +65,7 @@ class ConditionalBlockOp : public ConditionalOp { scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); - auto *block = Attr("sub_block"); + auto *block = Attr("sub_block"); framework::Executor 
exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); } @@ -74,8 +74,7 @@ class ConditionalBlockOp : public ConditionalOp { class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ConditionalBlockOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The conditional variable of this operator. If X is empty, the " @@ -87,7 +86,7 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(std::vector) The step scope of conditional block. To " "unify the conditional block, rnn and while op, the type of " "scope is std::vector"); - AddAttr( + AddAttr( "sub_block", "The step block of conditional block operator"); AddComment(R"DOC(Conditional block operator @@ -117,7 +116,7 @@ class ConditionalBlockGradOp : public ConditionalOp { auto &scopes = scope_var->Get>(); framework::Scope &cur_scope = *scopes[0]; - auto *block = Attr("sub_block"); + auto *block = Attr("sub_block"); framework::Executor exec(dev_ctx); exec.Run(*block->Program(), &cur_scope, block->ID(), false); @@ -171,18 +170,19 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); grad_op->SetInput("X", Input("X")); grad_op->SetInput("Params", Input("Params")); grad_op->SetInput("Out", Output("Out")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params")); + grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + grad_op->SetOutput(framework::GradVarName("Params"), + InputGrad("Params", false)); grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc index 008bf01885ecddd1fee76a33c43370d07a8988a2..5b27ada55d737c31f8e65dc9b460a3a2ea11b869 100644 --- a/paddle/operators/conv_cudnn_op.cc +++ b/paddle/operators/conv_cudnn_op.cc @@ -19,8 +19,7 @@ namespace operators { class CudnnConv2DOpMaker : public Conv2DOpMaker { public: - CudnnConv2DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CudnnConv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : Conv2DOpMaker(proto, op_checker) { AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " @@ -34,8 +33,7 @@ class CudnnConv2DOpMaker : public Conv2DOpMaker { class CudnnConv3DOpMaker : public Conv3DOpMaker { public: - CudnnConv3DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CudnnConv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : Conv3DOpMaker(proto, op_checker) { AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 7ef805fd44bf94d3279ffa50f86993b3f2b64412..abe82e124121a6c57d1c3ca7337804f5a4ab3d38 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -66,8 +66,7 @@ void 
ConvOp::InferShape(framework::InferShapeContext* ctx) const { ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } -Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) +Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "Input", @@ -138,8 +137,7 @@ $$ )DOC"); } -Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) +Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "Input", diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index d2de4e80f751d4938ac9cad60871b470fccf225c..83786e2329e7ae3c2908fdfdaeb1f79d19a53f47 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -50,14 +50,12 @@ inline bool IsExpand(std::vector& filter_dim, // operator implementations can reuse the code. class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv2DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv3DOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class ConvOp : public framework::OperatorWithKernel { diff --git a/paddle/operators/conv_shift_op.cc b/paddle/operators/conv_shift_op.cc index a4150a5664690e750d2501a1849767c23209186b..ac2f80625935e14189d27bf738e9b9985a7f42c2 100644 --- a/paddle/operators/conv_shift_op.cc +++ b/paddle/operators/conv_shift_op.cc @@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel { class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { public: - ConvShiftOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor), a 2-D tensor with shape B x M, " diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc index 4cb6a2ccffc76066ea0868f76ba2a3bfb9e5e450..8980ff91f5d7c585d9ce0ce62cfb90f47ea86ec6 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -19,11 +19,8 @@ namespace operators { class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { public: - CudnnConv2DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CudnnConv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : Conv2DTransposeOpMaker(proto, op_checker) { - AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault({1, 1}); AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " "workspace is a section of GPU memory which will be " @@ -36,11 +33,8 @@ class CudnnConv2DTransposeOpMaker : public Conv2DTransposeOpMaker { class CudnnConv3DTransposeOpMaker : public Conv3DTransposeOpMaker { public: - CudnnConv3DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CudnnConv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : Conv3DTransposeOpMaker(proto, op_checker) { - AddAttr>("dilations", "dilations of convolution operator.") - .SetDefault({1, 1, 1}); AddAttr("workspace_size_MB", "workspace size for cudnn, in MB, " "workspace is a 
section of GPU memory which will be " diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index ca063e94bbe64817567a298c3b1ad9306667536d..5e24fc4b2c8b479caac417c957033f7552e1c3f0 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -29,6 +29,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { auto filter_dims = ctx->GetInputDim("Filter"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, "ConvTransposeOp intput should be 4-D or 5-D tensor."); @@ -41,20 +42,24 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), "ConvTransposeOp paddings dimension and strides " "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], "In ConvTransposeOp, The input channel should be the same " "as the number of filters."); std::vector output_shape({in_dims[0], filter_dims[1]}); for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + - filter_dims[i + 2]); + filter_extent); } ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } -Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( - framework::OpProto* proto, framework::OpAttrChecker* op_checker) +Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "Input", @@ -73,6 +78,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); + + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); AddAttr>( "strides", "(vector default:{1, 1}), the strides(h_stride, w_stride) of " @@ -87,7 +98,7 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( Convolution2D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and dilations, strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the number of channels, H is the height of the feature, and W is the width of the feature. @@ -112,8 +123,8 @@ Example: )DOC"); } -Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( - framework::OpProto* proto, framework::OpAttrChecker* op_checker) +Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto, + OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(Tensor) The input tensor of convolution transpose operator." 
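The dilations support added to ConvTransposeOp::InferShape above boils down to replacing the raw filter size with an effective extent, dilation * (k - 1) + 1, so the output size becomes (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1. A standalone check of that arithmetic, independent of the framework (the helper name and sample numbers are illustrative only):

#include <cstdio>

// Output size of a transposed convolution along one spatial dimension,
// mirroring the formula used in ConvTransposeOp::InferShape above.
int ConvTransposeOutSize(int in, int k, int stride, int pad, int dilation) {
  const int filter_extent = dilation * (k - 1) + 1;  // effective filter size
  return (in - 1) * stride - 2 * pad + filter_extent;
}

int main() {
  // 7-wide input, 3-wide filter, stride 2, padding 1:
  // dilation 1 gives 13; dilation 2 stretches the filter to 5 and gives 15.
  std::printf("%d %d\n", ConvTransposeOutSize(7, 3, 2, 1, 1),
              ConvTransposeOutSize(7, 3, 2, 1, 2));
  return 0;
}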
@@ -136,6 +147,13 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "Where N is batch size, C is " "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); + + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation,h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); AddAttr>("strides", "(vector default:{1, 1, 1}), the " "strides{d_stride, h_stride, w_stride} of " @@ -149,7 +167,7 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( Convolution3D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and dilations, strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 1171b0435fd2b1abe541043e8283a8fc09dc13c7..4c8f8a80672788e8b2919e500d3627adec1ad035 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -30,14 +30,12 @@ using DDim = framework::DDim; // operator implementations can reuse the code. class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv2DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: - Conv3DTransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class ConvTransposeOp : public framework::OperatorWithKernel { @@ -63,6 +61,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); // groups will alway be disabled in conv2dtranspose. 
const int batch_size = static_cast(input->dims()[0]); @@ -115,7 +114,6 @@ class GemmConvTransposeKernel : public framework::OpKernel { math::Col2ImFunctor col2im; math::Col2VolFunctor col2vol; - std::vector dilations({1, 1, 1}); // convolution transpose: gemm + col2im or col2vol (similar to conv-backward // on input) @@ -167,6 +165,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); const int batch_size = static_cast(input->dims()[0]); @@ -221,7 +220,6 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { math::Im2ColFunctor im2col; math::Vol2ColFunctor vol2col; - std::vector dilations({1, 1, 1}); if (input_grad) { input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 440c427cba9396ec6d0ebf7814d671e45f45412d..a4d4a78d3200259403695a73ed9cfabe9baf8876 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -62,7 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel { class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { public: - CosSimOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The 1st input of cos_sim op."); AddInput("Y", "The 2nd input of cos_sim op."); diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 1ce189fa6ebba3712467572c55d599975bbe7534..27d0871f82beed4ceb3a4439be097a580631d4c6 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -18,8 +18,7 @@ namespace paddle { namespace operators { class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { public: - CRFDecodingOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Emission", "(LoDTensor, default: LoDTensor). A LoDTensor with shape " diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 5c973fbb3cf9513d82a5b87719cb947466082424..87fcab4cca669a356ced8951fbdc3c3ee3a24f3d 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -52,7 +52,7 @@ class CropOp : public framework::OperatorWithKernel { class CropOpMaker : public framework::OpProtoAndCheckerMaker { public: - CropOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + CropOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of pad op. 
" diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 2b06012b690c6725fd150cd99e992912655dc9c6..1ab7c0a06f85f332b290cb6cac82d0cfbe8f3242 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -111,8 +111,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: - CrossEntropyOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor), a 2-D tensor with shape N x D, " diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc index fd29c7270b0442da740a74f83fdfeed8f47f830d..739a8d881c35817756421a3299901c9e5e7d96ba 100644 --- a/paddle/operators/decayed_adagrad_op.cc +++ b/paddle/operators/decayed_adagrad_op.cc @@ -55,8 +55,7 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - DecayedAdagradOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc index 89dc5045221156eed7aa9411bc96ad86f91136d2..517a1946a0c25081b20c320f4104c81503a4249e 100644 --- a/paddle/operators/detail/recv_impl.cc +++ b/paddle/operators/detail/recv_impl.cc @@ -20,25 +20,57 @@ namespace detail { Status SendRecvServerImpl::SendVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) { - framework::LoDTensor t; - // TODO(typhoonzero): desirealize in_tensor and run pserver network. + VoidMessage *out_var) { + // TODO(typhoonzero): support different variable types. std::istringstream iss(in_var->serialized()); + framework::LoDTensor t; framework::DeserializeFromStream(iss, &t); - lodtensor_queue_.Push(std::move(t)); - // Block util the sub graph is done. - t = lodtensor_return_queue_.Pop(); + TensorWithName tensor_with_name = + std::make_pair(in_var->varname(), std::move(t)); + + var_recv_queue_.Push(std::move(tensor_with_name)); + return Status::OK; +} + +Status SendRecvServerImpl::GetVariable(ServerContext *context, + const VariableMessage *in_var, + VariableMessage *out_var) { + std::string get_var_name = in_var->varname(); + auto *var = scope_->FindVar(get_var_name); + auto tensor = var->Get(); std::ostringstream oss; - // FIXME(typhoonzero): get context from op. 
- framework::SerializeToStream(oss, t, platform::CPUDeviceContext()); + framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext()); + std::string *varname = out_var->mutable_varname(); - *varname = in_var->varname(); + *varname = get_var_name; std::string *serialized = out_var->mutable_serialized(); *serialized = oss.str(); + return Status::OK; +} +Status SendRecvServerImpl::Wait(ServerContext *context, + const VoidMessage *in_var, + VoidMessage *out_var) { + { + std::unique_lock lock(this->mutex_); + condition_.wait(lock, [=] { return this->done_ == true; }); + } return Status::OK; } +void SendRecvServerImpl::Reset() { + std::lock_guard lock(this->mutex_); + done_ = false; +} + +void SendRecvServerImpl::Done() { + { + std::lock_guard lock(this->mutex_); + done_ = true; + } + condition_.notify_all(); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc index da1ddf75d2afb85670c5ea0c9884376415f28208..d7165e13db961f7719139b1a40211d6a2ca67a9f 100644 --- a/paddle/operators/detail/send_impl.cc +++ b/paddle/operators/detail/send_impl.cc @@ -19,10 +19,10 @@ namespace operators { namespace detail { bool RPCClient::SendVariable(const framework::Scope& scope, - const std::string& inname, - const std::string& outname) { + const std::string& inname) { ClientContext context; - VariableMessage msg, out_msg; + VariableMessage msg; + VoidMessage out_msg; // FIXME(typhoonzero): pass device context to here. auto ctx = platform::CPUDeviceContext(); auto* var = scope.FindVar(inname); @@ -37,9 +37,26 @@ bool RPCClient::SendVariable(const framework::Scope& scope, msg.set_serialized(oss.str()); Status status = stub_->SendVariable(&context, msg, &out_msg); if (!status.ok()) { + LOG(ERROR) << "gRPC error: " << status.error_message(); return false; } - std::istringstream iss(out_msg.serialized()); + return true; +} + +bool RPCClient::GetVariable(const framework::Scope& scope, + const std::string& outname) { + ClientContext context; + VariableMessage call_msg, ret_msg; + call_msg.set_varname(outname); + auto ctx = platform::CPUDeviceContext(); + Status status = stub_->GetVariable(&context, call_msg, &ret_msg); + if (!status.ok()) { + LOG(ERROR) << "gRPC error: " << status.error_message(); + return false; + } + + std::istringstream iss(ret_msg.serialized()); + framework::LoDTensor ret_tensor; framework::DeserializeFromStream(iss, &ret_tensor); auto* outvar = scope.FindVar(outname); @@ -49,6 +66,12 @@ bool RPCClient::SendVariable(const framework::Scope& scope, return true; } +void RPCClient::Wait() { + ClientContext context; + VoidMessage call_msg, ret_msg; + stub_->Wait(&context, call_msg, &ret_msg); +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 07ff9d2c621a2dfb51792821a0d3fc398c315835..ce729908062ad442e66cc00001e14ceb6f268560 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -19,7 +19,12 @@ package sendrecv; service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor - rpc SendVariable(VariableMessage) returns (VariableMessage) {} + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. 
+ rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // wait for one execution of the program + rpc Wait(VoidMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h index b9a5340a8636db7b5d6ec7b21368632d3916b4aa..eec9dd38d188247cba4da2a377038a28c847e40e 100644 --- a/paddle/operators/detail/send_recv_impl.h +++ b/paddle/operators/detail/send_recv_impl.h @@ -20,10 +20,6 @@ #include "paddle/framework/selected_rows.h" #include "paddle/operators/detail/simple_block_queue.h" -// #include -// #include -// #include -// #include #include "paddle/operators/detail/send_recv.grpc.pb.h" #include "paddle/operators/detail/send_recv.pb.h" @@ -48,24 +44,32 @@ namespace paddle { namespace operators { namespace detail { +typedef std::pair TensorWithName; + class SendRecvServerImpl final : public SendRecvService::Service { public: explicit SendRecvServerImpl() {} Status SendVariable(ServerContext *context, const VariableMessage *in_var, - VariableMessage *out_var) override; - - const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); } + VoidMessage *out_var) override; + Status GetVariable(ServerContext *context, const VariableMessage *in_var, + VariableMessage *out_var) override; + Status Wait(ServerContext *context, const VoidMessage *in_var, + VoidMessage *out_var) override; + void Reset(); + void Done(); + void SetScope(framework::Scope *scope) { scope_ = scope; }; - void Push(const framework::LoDTensor &tensor) { - this->lodtensor_return_queue_.Push(tensor); - } + const TensorWithName Get() { return this->var_recv_queue_.Pop(); } private: - SimpleBlockQueue lodtensor_queue_; - SimpleBlockQueue lodtensor_return_queue_; - SimpleBlockQueue selected_rows_queue_; - SimpleBlockQueue selected_rows_return_queue_; + // received variable from RPC, operators fetch variable from this queue. 
+ SimpleBlockQueue var_recv_queue_; + framework::Scope *scope_; + // condition of the sub program + std::mutex mutex_; + bool done_; + std::condition_variable condition_; }; // RPCClient is a class to send tensors to pserver sub-network @@ -75,8 +79,9 @@ class RPCClient { RPCClient(std::shared_ptr channel) : stub_(SendRecvService::NewStub(channel)) {} - bool SendVariable(const framework::Scope &scope, const std::string &inname, - const std::string &outname); + bool SendVariable(const framework::Scope &scope, const std::string &inname); + bool GetVariable(const framework::Scope &scope, const std::string &outname); + void Wait(); private: std::unique_ptr stub_; diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index acd526ae8047292ce6c6756f174c80053dca0d9f..c4bee44e3e5a16334fb9070165eab5c7cdf0141c 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -40,8 +40,7 @@ class DropoutOp : public framework::OperatorWithKernel { template class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { public: - DropoutOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of dropout op."); AddOutput("Out", "The output of dropout op."); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index 10c670751d026ef92e01aad7da31a8f59b8514c0..c31d2195e95b116451b0f620f6582f65c0dae706 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -71,7 +71,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto M = EigenMatrix::Reshape(*mask, 1); Y.device(place) = X * M; } else { - Y.device(place) = X * dropout_prob; + Y.device(place) = X * (1.0f - dropout_prob); } } }; diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index 84ad39f0bb639975365d427aa205411ef79ecd46..9f6c4212d4f834bbb2d1c65c836f3d3d0f3e0c96 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -57,7 +57,7 @@ class CPUDropoutKernel : public framework::OpKernel { auto Y = EigenMatrix::Reshape(*y, 1); auto& place = *context.template device_context().eigen_device(); - Y.device(place) = X * dropout_prob; + Y.device(place) = X * (1.0f - dropout_prob); } } }; diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index a62eeeeb95fef77c00258403ca1cae11c2db7173..b6bd794a74665cef546347015be25ab989e852b2 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -19,8 +19,7 @@ namespace paddle { namespace operators { class ElementwiseAddOpMaker : public ElementwiseOpMaker { public: - ElementwiseAddOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { SetComment("Add", "$Out = X + Y$"); AddComment(comment_); diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 1c3e9e70eef0c1adfb89cf1a58437092f8d536d7..78eae53f53593e5fd3a20daad09098190b4b59f6 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -19,8 +19,7 @@ namespace paddle { namespace operators { class ElementwiseDivOpMaker : public ElementwiseOpMaker { public: - ElementwiseDivOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseDivOpMaker(OpProto* proto, 
OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { SetComment("Div", "$Out = X / Y$"); AddComment(comment_); diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index aadb95cbe35fe565cf1009f0f9765def921d0906..f0a61b8b081f5675b1684022e61876ed4d1d4aca 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -20,8 +20,7 @@ namespace operators { class ElementwiseMulOpMaker : public ElementwiseOpMaker { public: - ElementwiseMulOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { SetComment("Mul", "$Out = X \\odot\\ Y$"); AddComment(comment_); diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h index ea533503e4916cae7e1157ed34da9629dcff3513..f308ee05e11210540e41cda4b9a896f9f96c4730 100644 --- a/paddle/operators/elementwise_op.h +++ b/paddle/operators/elementwise_op.h @@ -43,8 +43,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { public: - ElementwiseOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The first input tensor of elementwise op"); AddInput("Y", "(Tensor) The second input tensor of elementwise op"); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 7ebfc7df8c117edd7bcf14cc5ae6ba3dc1302c03..9edfacd6dfb506e12a0d82772d8de301bb8506e2 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -103,10 +103,12 @@ class MidWiseTransformIterator { MidWiseTransformIterator& operator++() { ++j_; - i_ = j_ / post_; - if (UNLIKELY(i_ == n_)) { + if (UNLIKELY(j_ == post_)) { + ++i_; j_ = 0; - i_ = 0; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } } return *this; } @@ -125,10 +127,10 @@ class MidWiseTransformIterator { private: const T* ptr_; - int i_; + int64_t i_; int64_t j_; int64_t n_; - int post_; + int64_t post_; }; #ifdef __NVCC__ diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 3e4d19361ead0100e45e50880d402e3d2b8557ff..1c4168621c343f14d603b18dd6c518052f83ad0d 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -19,8 +19,7 @@ namespace paddle { namespace operators { class ElementwiseSubOpMaker : public ElementwiseOpMaker { public: - ElementwiseSubOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker) : ElementwiseOpMaker(proto, op_checker) { SetComment("Sub", "$Out = X - Y$"); AddComment(comment_); diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc index 8b3cddbb944de250d5754a2be64dd8e7ec53003a..08fa91ed72aa41ed2f513c090b9085410bb5cc47 100644 --- a/paddle/operators/expand_op.cc +++ b/paddle/operators/expand_op.cc @@ -55,7 +55,7 @@ class ExpandOp : public framework::OperatorWithKernel { class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { public: - ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor) A tensor 
with rank in [1, 6]." diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index ee43c22fb13e203c7de1a7e6d1586423fcbfb25a..66b8080c26192a74cc27bce9a00107de89822717 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -54,8 +54,7 @@ class FeedOp : public framework::OperatorBase { class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - FeedOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of feed op"); AddOutput("Out", "The output of feed op"); diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 1ae07194c235ce6724f59c9c60df80f957787cda..616590f2001be3bea4e50c0c1755a80eb20e9348 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -61,8 +61,7 @@ class FetchOp : public framework::OperatorBase { class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - FetchOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of fetch op"); AddOutput("Out", "The output of fetch op"); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 7fb74e2b950338fbd05515f844959862504eddce..7a7e280e78309582a627087bdbdfea358c37b9eb 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -52,7 +52,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( - static_cast(ctx.Attr("dtype")), + static_cast(ctx.Attr("dtype")), ctx.device_context()); } }; @@ -60,13 +60,12 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { class FillConstantBatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillConstantBatchSizeLikeOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddInput("Input", "(Tensor) Tensor " "whose dim_idx th dimension is used to specify the batch_size"); diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc index 3d5f84bc239615797a5cf01a74150fdb7dfc1b80..3489079eaa3e8f04e27941de942ce9e14f8434f9 100644 --- a/paddle/operators/fill_constant_op.cc +++ b/paddle/operators/fill_constant_op.cc @@ -34,7 +34,8 @@ class FillConstantOp : public framework::OperatorBase { using framework::OperatorBase::OperatorBase; void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - auto data_type = static_cast(Attr("dtype")); + auto data_type = + static_cast(Attr("dtype")); auto value = Attr("value"); auto force_cpu = Attr("force_cpu"); auto &out = @@ -52,13 +53,12 @@ class FillConstantOp : public framework::OperatorBase { class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillConstantOpMaker(framework::OpProto *proto, - framework::OpAttrChecker 
*op_checker) + FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddAttr>("shape", "(vector) The shape of the output"); AddAttr("value", "(float, default 0) The value to be filled") .SetDefault(0.0f); diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc index 382e161c5d83ba560411b1f231aa896028b709b8..f0c6cff8e34c9038c2321c0326bd2ef728d665ba 100644 --- a/paddle/operators/fill_op.cc +++ b/paddle/operators/fill_op.cc @@ -48,7 +48,7 @@ class FillOp : public framework::OperatorBase { "Cannot find variable %s", Output("Out")) .GetMutable()); out.Resize(framework::make_ddim(Attr>("shape"))); - auto dtype = static_cast(Attr("dtype")); + auto dtype = static_cast(Attr("dtype")); platform::CPUPlace cpu; auto force_cpu = Attr("force_cpu"); out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), @@ -76,7 +76,7 @@ class FillOp : public framework::OperatorBase { class FillOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + FillOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddComment(R"DOC(Fill operator @@ -88,7 +88,7 @@ Fill an tensor with `value` and `shape`. The type of the tensor is specify by "value", "The float values of tensor, which are flatten in row major"); AddAttr>("shape", "The shape of output tensor"); AddAttr("dtype", "The data type of output tensor, Default is float") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddAttr("force_cpu", "Whether the output tensor must be at CPU memory or not. " "Default is false.") diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 720c11f5f12a8dea971fe82db6afe8f6b0d9ee1a..b4ae1de876010effff6bf577a4e33043f6760a4f 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -24,20 +24,19 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FillZerosLikeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Y"), - "Output(Y) of FillZerosLikeOp should not be null."); - ctx->SetOutputDim("Y", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Y"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FillZerosLikeOp should not be null."); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); } }; class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: - FillZerosLikeOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of fill-zeros-like op."); - AddOutput("Y", "The variable will be filled up with zeros."); + AddOutput("Out", "The variable will be filled up with zeros."); AddComment(R"DOC( FillZerosLike Operator. 
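The fill_constant_batch_size_like, fill_constant and fill changes above (and gaussian_random further down) all follow one pattern: the dtype attribute keeps its int storage and FP32 default, but both the default and the kernel-type cast now name framework::proto::DataType. A condensed sketch of that pattern; MyFillOp is a placeholder, and the Attr<int> read assumes the attribute is registered as an int, as in these ops.

#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

class MyFillOp : public framework::OperatorWithKernel {  // placeholder op
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  framework::OpKernelType GetKernelType(
      const framework::ExecutionContext &ctx) const override {
    // The int-valued "dtype" attribute is interpreted as proto::DataType.
    return framework::OpKernelType(
        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
        ctx.device_context());
  }
};

}  // namespace operators
}  // namespace paddle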
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index a6e2941f52150de7886717303d2cb2f10b7eef7b..351ecf8b2f1d945fabdd1d6c5ed56f76f3caae61 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -23,7 +23,7 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Y"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); math::SetConstant setter; diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc index b14913ff213c84051b5a945f4a470cea4039a289..d00700823d48eb2ea4fc64d1fa2989f18c7c5f18 100644 --- a/paddle/operators/ftrl_op.cc +++ b/paddle/operators/ftrl_op.cc @@ -57,7 +57,7 @@ class FTRLOp : public framework::OperatorWithKernel { class FTRLOpMaker : public framework::OpProtoAndCheckerMaker { public: - FTRLOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index 8f80fb162519f60fcce897b3c31a3507bbf6ba6d..47af222314c40a2c77ee422ccc70602078b3f1fb 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -67,7 +67,7 @@ class GatherGradOp : public framework::OperatorWithKernel { class GatherOpMaker : public framework::OpProtoAndCheckerMaker { public: - GatherOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The source input of gather op"); AddInput("Index", "The index input of gather op"); diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 254c83e1378a121d99c89d9d8705935b5f06edc8..5eab1d5f4ee067db602ab81a9df1854bcfaf78a8 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -60,15 +60,14 @@ class GaussianRandomOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - static_cast(ctx.Attr("dtype")), + static_cast(ctx.Attr("dtype")), ctx.device_context()); } }; class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: - GaussianRandomOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "Output matrix of gaussian random op"); @@ -91,7 +90,7 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("dtype", "(int, default 5(FP32)) " "Output data type.") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(R"DOC( GaussianRandom Operator. 
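Several hunks above (fill_constant_batch_size_like, fill_constant, fill, gaussian_random) read as `static_cast(ctx.Attr("dtype"))` because the angle-bracketed template arguments do not survive in this copy of the patch. Read against the neighbouring `.SetDefault(framework::proto::DataType::FP32)` lines, the intended kernel-type hook is almost certainly the form sketched below; the cast target and the attribute type are reconstructions, not text quoted from the patch.

```c++
// Hedged reconstruction of the GetKernelType hook as it sits inside an
// OperatorWithKernel subclass such as GaussianRandomOp: the "dtype" attribute
// is read as an int and cast to the protobuf data-type enum.
framework::OpKernelType GetKernelType(
    const framework::ExecutionContext &ctx) const override {
  return framework::OpKernelType(
      static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
      ctx.device_context());
}
```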
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 311e7edcf1519bc706a51e4d9242a1ebee5168ca..8e7000654c62b50a3ca130e2ffed4a0f5880de91 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -67,7 +67,7 @@ class GRUOp : public framework::OperatorWithKernel { class GRUOpMaker : public framework::OpProtoAndCheckerMaker { public: - GRUOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(LoDTensor) The first input is a LodTensor, which supports " diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 705de87be5b67fbc343a89eeba2282941b264c8a..7e5f674a8c020d931fd375ff5994da18052aa8fa 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel { class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker { public: - GRUUnitOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the " diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc index 373b4d99b47f2a8ab06c7584a25acee59b6f3e3b..19d2e9dc56fe11f9dfb13e8cb271a23e128bf91b 100644 --- a/paddle/operators/hinge_loss_op.cc +++ b/paddle/operators/hinge_loss_op.cc @@ -46,8 +46,7 @@ class HingeLossOp : public framework::OperatorWithKernel { template class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - HingeLossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Logits", "The input value (Logits) of Hinge loss op." diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 11828d083a55f0a38cf3b8513b7395bbb5592581..5c92f2c7b2d2f701bcc487716db41a0cce91002f 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel { template class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - HuberLossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input value of huber loss op." 
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 54911267e36dfdbc62d533f40f0b754e7d2cb7bf..789c92102d63355a80c3330f2107c731206397f4 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -70,8 +70,7 @@ class IncrementOp : public framework::OperatorBase { class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { public: - IncrementOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of increment operator"); AddOutput("Out", "(Tensor) The output tensor of increment operator."); @@ -94,13 +93,13 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("increment"); grad_op->SetInput("X", Output("Out")); grad_op->SetOutput("Out", Input("X")); grad_op->SetAttr("step", -boost::get(GetAttr("step"))); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc index 54fecf44e881b5c283c81580fd161da9808d253e..3616a0414f9e889376f8ba46e7567d7171eff3bf 100644 --- a/paddle/operators/is_empty_op.cc +++ b/paddle/operators/is_empty_op.cc @@ -47,8 +47,7 @@ class IsEmptyOp : public framework::OperatorBase { class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - IsEmptyOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(kInput, "(Tensor) Tensor which is to be checked."); AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not."); diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index c0b51202c6bb708a682568175c56583394961535..3d1da79763102c876de3b45e56438da909b00394 100644 --- a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -48,7 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel { class L1NormOpMaker : public framework::OpProtoAndCheckerMaker { public: - L1NormOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of l1_norm op."); AddOutput("Out", "(Scalar) The output of l1_norm op."); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 896e3657d4406c5a1fe07f1712abb2ff0370fd3c..ad15e8ebd2b323929a4448e98a18c5cad6f5ed12 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -19,8 +19,7 @@ namespace operators { class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { public: - LinearChainCRFOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Emission", "(LoDTensor, default LoDTensor) " diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index 
4e58b84430f2a8697bbbc1acf971fd063120f563..6c51dad27a4d9cd9e48b8591b1f14472c83ceaf1 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -58,8 +58,7 @@ class LoadOp : public framework::OperatorBase { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoadOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "(Tensor) The tensor need to be loaded"); AddAttr("file_path", diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc index b2f4ec57fadd2ba3dc8708abbfebaaeb67100f1e..cc8593810baf83e12368e67ceaeef0631e35c051 100644 --- a/paddle/operators/lod_array_length_op.cc +++ b/paddle/operators/lod_array_length_op.cc @@ -38,8 +38,7 @@ class LoDArrayLengthOp : public framework::OperatorBase { class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoDArrayLengthProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensorArray) The input tensor array."); AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc index f7d4db1947b83fecf57575e17fafe26795c92bdd..2d67046bfee01d8d148da1c8b705d3ad959a4839 100644 --- a/paddle/operators/lod_rank_table_op.cc +++ b/paddle/operators/lod_rank_table_op.cc @@ -30,13 +30,13 @@ class LoDRankTableOp : public framework::OperatorBase { scope.FindVar(Output("Out"))->GetMutable(); VLOG(10) << "Level = " << static_cast(Attr("level")); out->Reset(x.lod(), static_cast(Attr("level"))); + VLOG(10) << Input("X") << "'s lod information is " << *out; } }; class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoDRankTableOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) input lod tensor, must contain lod information."); @@ -63,11 +63,11 @@ class LoDRankTableInferShape : public framework::InferShapeBase { class LoDRankTableInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { for (auto &o : op_desc.Output("Out")) { block->FindRecursiveOrCreateVar(o)->SetType( - framework::VarDesc::LOD_RANK_TABLE); + framework::proto::VarDesc::LOD_RANK_TABLE); } } }; diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc index 32831cb1e2cf188a507773ef1e00b22de98d82ab..ccb87258c6b8629cd18d08185bfcc84c247070dd 100644 --- a/paddle/operators/lod_reset_op.cc +++ b/paddle/operators/lod_reset_op.cc @@ -48,8 +48,7 @@ class LoDResetOp : public framework::OperatorWithKernel { class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { public: - LoDResetOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) The input tensor of lod_reset operator."); AddInput("TargetLoD", diff --git 
a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index b970bf31773f4c6feb0010bd40ba906b388ec310..643f8859f3d0d44c0b5be922bd786ab04093df94 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -97,8 +97,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - LoDTensorToArrayOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", ""); AddInput("RankTable", ""); @@ -128,10 +127,10 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { class LoDTensorToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY); + block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); } } }; @@ -141,14 +140,14 @@ class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("array_to_lod_tensor"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("RankTable", Input("RankTable")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc index 4524229a330a0ceddca673e2b2a6d836a15a2e3f..f714945354c5668f58e273dc8d6c7c16d51ac17d 100644 --- a/paddle/operators/log_loss_op.cc +++ b/paddle/operators/log_loss_op.cc @@ -46,8 +46,7 @@ class LogLossOp : public framework::OperatorWithKernel { template class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - LogLossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Predicted", "The input value (Predicted) of Log loss op." 
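The same rename runs through the gradient-description makers and the var-type inference hooks above: `framework::OpDescBind` becomes `framework::OpDesc`, `framework::BlockDescBind` becomes `framework::BlockDesc`, and the variable-type enum moves under `framework::proto::VarDesc`. The `std::unique_ptr` template argument in `Apply()` is another casualty of the stripped angle brackets; the sketch below restores it by assumption and is a composite of the increment and lod_tensor_to_array hunks, with placeholder operator and variable names.

```c++
// Composite sketch (not part of the patch) of a gradient maker in the
// post-rename style: Apply() builds and returns a framework::OpDesc.
class ExampleGradOpDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto *grad_op = new framework::OpDesc();
    grad_op->SetType("example_grad");
    grad_op->SetInput("X", Input("X"));
    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    grad_op->SetAttrMap(Attrs());
    return std::unique_ptr<framework::OpDesc>(grad_op);
  }
};
```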
diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index c818d5e9c19abab15ebdc2b3485e03ab66cf649d..2bd6c6efae38d6d8d49cc9f3fd97cf316fbbdd0a 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -20,8 +20,7 @@ namespace operators { template class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - BinaryLogicalOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { OpComment comment; AddInput("X", @@ -45,8 +44,7 @@ Each element of Out is calculated by %s template class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - UnaryLogicalOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { OpComment comment; AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator", diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 93e812ac5be5aea6bf3ab353d31480322c51ccbc..0a9defa8c50453abf3eefdcb89126b1349d6d756 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -51,8 +51,7 @@ class LookupTableOp : public framework::OperatorWithKernel { class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { public: - LookupTableOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("W", "An input represents embedding tensors, " @@ -109,19 +108,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(out_var_name)->SetType(framework::VarDesc::SELECTED_ROWS); + block->Var(out_var_name) + ->SetType(framework::proto::VarDesc::SELECTED_ROWS); } else { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(out_var_name)->SetType(framework::VarDesc::LOD_TENSOR); + block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR); } } }; diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index b5b7bc940a85ac2bbb6c6b303284777df714b7d6..3b77b27b72d7079c10695da43a4fcfed9b4c855c 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -140,7 +140,7 @@ class LRNOp : public framework::OperatorWithKernel { template class LRNOpMaker : public framework::OpProtoAndCheckerMaker { public: - LRNOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of LRN operator. 
" diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index 2db7da30db416e03cf473c8e65b023d9265e9193..f82156170e672b5e590ddb8e0e6e8a2a24ea6868 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -102,7 +102,7 @@ class LSTMOp : public framework::OperatorWithKernel { class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { public: - LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(LoDTensor) the first input is a LodTensor, which support " diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc index b6eb33bafe50548502a0478d37842fd2dfdebda4..34da75c00d336d3f540a9472ee2e6c4b224add09 100644 --- a/paddle/operators/lstm_unit_op.cc +++ b/paddle/operators/lstm_unit_op.cc @@ -48,8 +48,7 @@ class LstmUnitOp : public framework::OperatorWithKernel { class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker { public: - LstmUnitOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Lstm unit only applies non-linear activations, please make sure" diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index 42e8961c0ea57650a823ee4b58516f66a455b385..fddc72aec0aa7fa17ef585388c53da640d3c1837 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel { template class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - MarginRankLossOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X1", "(2-D tensor with shape [batch_size x 1]) The score for " diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index 707ebf05962fb65892c2adbbf41a0a3449763d31..c2633b2e16434558d16f699a701e7b8cf1de8342 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -61,14 +61,13 @@ class Im2ColFunctor(); T* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; - int c_im = c / filter_width / filter_height; + int c_im = c / (filter_width * filter_height); for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; for (int w = 0; w < col_width; ++w) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; int col_idx = (c * col_height + h) * col_width + w; int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; @@ -130,16 +129,14 @@ class Col2ImFunctor= 0 && (im_row_idx) < im_height && (im_col_idx) >= 0 && (im_col_idx) < im_width) { - im_row_idx += c_im * im_height; - im_data[im_row_idx * im_width + im_col_idx] += + im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] += col_data[(c * col_height + h) * col_width + w]; } } @@ -199,12 +196,13 @@ class Im2ColFunctor= 0 && im_row_offset < im_height && im_col_offset >= 0 && im_col_offset < im_width) { int im_offset = diff --git a/paddle/operators/math/math_function_impl.h 
b/paddle/operators/math/math_function_impl.h index 3e6d83386589a02c7d8f62394c1c2becb606504c..aced2690bce9a4c2db6309f18e23a8f6cd0211f3 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -67,18 +67,45 @@ void RowwiseAdd::operator()(const DeviceContext& context, template void ColwiseSum::operator()(const DeviceContext& context, const framework::Tensor& input, - framework::Tensor* vector) { + framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), size); - auto vec = framework::EigenMatrix::From(*vector); auto in = framework::EigenMatrix::From(input); - Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.eigen_device()) = - in.sum(Eigen::array({{0}})).reshape(shape); + auto vec = framework::EigenVector::Flatten(*out); + + vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); } +// Specialize for CPU, since Eigen implement a general reduce. However, +// colwise-sum can be easily implemented. General reduce has a huge overhead in +// CPU +template +class ColwiseSum { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + auto& in_dims = input.dims(); + auto height = in_dims[0]; + auto size = in_dims[1]; + PADDLE_ENFORCE_EQ(out->numel(), size); + + T* out_buf = out->mutable_data(out->place()); + const T* in_buf = input.data(); + + for (size_t i = 0; i < height; ++i) { + for (size_t j = 0; j < size; ++j) { + if (i == 0) { + out_buf[j] = in_buf[i * size + j]; + } else { + out_buf[j] += in_buf[i * size + j]; + } + } + } + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc index ee0bc0c3708ac20ad00e3222060244d42dbd6f2f..fd65d894d5749c97f860d614de354e89f6d9441d 100644 --- a/paddle/operators/matmul_op.cc +++ b/paddle/operators/matmul_op.cc @@ -130,7 +130,7 @@ class MatMulOp : public framework::OperatorWithKernel { class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { public: - MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of MatMul op"); AddInput("Y", "The second input of MatMul op"); diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc index 798022c9dd904a0ac189b4b550a94264a433ebf2..dec2874a1fd13c1379e37d7b9755d465ffb1a6f7 100644 --- a/paddle/operators/max_sequence_len_op.cc +++ b/paddle/operators/max_sequence_len_op.cc @@ -40,8 +40,7 @@ class MaxSeqenceLenOp : public framework::OperatorBase { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - MaxSeqenceLenOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("RankTable", "The lod_rank_table."); AddOutput("Out", "The max sequence length."); diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index 011616e615a36efa0efe9ff15e678f1486c5177a..3ee32269417e80cd14a6ff0f8e52c0b2dec4b8be 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -20,7 +20,7 @@ using framework::Tensor; class MaxOutOpMaker : public 
framework::OpProtoAndCheckerMaker { public: - MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index 8932d700c2ae17eefe919eefae2282ae4a5a80a8..411f4d14efbfa5a8ee6dd7da645a044b191bf006 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -32,7 +32,7 @@ class MeanOp : public framework::OperatorWithKernel { class MeanOpMaker : public framework::OpProtoAndCheckerMaker { public: - MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of mean op"); AddOutput("Out", "The output of mean op"); @@ -60,13 +60,13 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); grad_op->SetType("mean_grad"); grad_op->SetInput("X", Input("X")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index adc688dbd5e13a2203d6842a12acdb8625288275..5edf29c3af958f5a939fdb830d46aca4b8d3dbe0 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -114,8 +114,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - MergeLoDTensorOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input LoDTensor, contains complete lod information to " @@ -162,15 +161,15 @@ class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("split_lod_tensor"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetInput("Mask", Input("Mask")); grad_op->SetOutput("OutTrue", InputGrad("InTrue")); grad_op->SetOutput("OutFalse", InputGrad("InFalse")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 27f0c8de2053064e65d9984ec9bd4242fee48e5f..2e9cc9d29d8c92ac56b451834f930758216e6a44 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -46,7 +46,7 @@ class MinusOp : public framework::OperatorWithKernel { class MinusOpMaker : public framework::OpProtoAndCheckerMaker { public: - MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The left tensor of minus operator."); AddInput("Y", "The 
right tensor of minus operator."); @@ -70,12 +70,11 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() - const override { - std::vector> ops; + std::vector> operator()() const override { + std::vector> ops; auto x_g = InputGrad("X"); if (!x_g.empty()) { - auto *x_g_op = new framework::OpDescBind(); + auto *x_g_op = new framework::OpDesc(); x_g_op->SetType("scale"); x_g_op->SetInput("X", OutputGrad("Out")); x_g_op->SetOutput("Out", x_g); @@ -85,7 +84,7 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { auto y_g = InputGrad("Y"); if (!y_g.empty()) { - auto *y_g_op = new framework::OpDescBind(); + auto *y_g_op = new framework::OpDesc(); y_g_op->SetType("scale"); y_g_op->SetInput("X", OutputGrad("Out")); y_g_op->SetOutput("Out", y_g); diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc index f0a42491bf04a5bbe2de10de2f702877c9a2f839..dbb28f8466b141502fbba8ae5d8a511a6b1d74c3 100644 --- a/paddle/operators/modified_huber_loss_op.cc +++ b/paddle/operators/modified_huber_loss_op.cc @@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel { class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - ModifiedHuberLossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of modified huber loss op. " diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index 2ab48fedecf0cce95dcf4d0593dcd4b30bc1f505..15b8b80776732f43c3ef4f8b80cffedf5c2a76fd 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -54,8 +54,7 @@ class MomentumOp : public framework::OperatorWithKernel { class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { public: - MomentumOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index bc4a5fdf0b37ce07b4c07bba9e1af5611d2be7e3..599df9c3df58db6444d7cb729e1a2c1f9f628b5b 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -71,41 +71,52 @@ class MulOpShapeInference : public framework::InferShapeBase { class MulOpMaker : public framework::OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + MulOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The first input of mul op"); - AddInput("Y", "The second input of mul op"); - AddOutput("Out", "The output of mul op"); + AddInput("X", "(Tensor), The first input tensor of mul op."); + AddInput("Y", "(Tensor), The second input tensor of mul op."); + AddOutput("Out", "(Tensor), The output tensor of mul op."); AddAttr( "x_num_col_dims", - "(int, default 1) " - R"DOC(mul_op can take tensors with more than two dimensions as input `X`, - in that case, tensors will be reshaped to a matrix. The matrix's first - dimension(column length) will be the product of tensor's last - `num_col_dims` dimensions, and the matrix's second dimension(row length) - will be the product of tensor's first `rank - num_col_dims` dimensions. 
+ R"DOC((int, default 1), The mul_op can take tensors with more than two + dimensions as its inputs. If the input $X$ is a tensor with more + than two dimensions, $X$ will be flattened into a two-dimensional + matrix first. The flattening rule is: the first `num_col_dims` + will be flattened to form the first dimension of the final matrix + (the height of the matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second dimension of the final + matrix (the width of the matrix). As a result, height of the + flattened matrix is equal to the product of $X$'s first + `x_num_col_dims` dimensions' sizes, and width of the flattened + matrix is equal to the product of $X$'s last `rank(x) - num_col_dims` + dimensions' size. For example, suppose $X$ is a 6-dimensional + tensor with the shape [2, 3, 4, 5, 6], and `x_num_col_dims` = 3. + Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = + [24, 30]. )DOC") .SetDefault(1) .EqualGreaterThan(1); AddAttr( "y_num_col_dims", - "(int, default 1) " - R"DOC(mul_op can take tensors with more than two dimensions as input `Y`, - in that case, tensors will be reshaped to a matrix. Just like input `X`. + R"DOC((int, default 1), The mul_op can take tensors with more than two, + dimensions as its inputs. If the input $Y$ is a tensor with more + than two dimensions, $Y$ will be flattened into a two-dimensional + matrix first. The attribute `y_num_col_dims` determines how $Y$ is + flattened. See comments of `x_num_col_dims` for more details. )DOC") .SetDefault(1) .EqualGreaterThan(1); AddComment(R"DOC( -Mul Operator. +Mul Operator. -This operator is used to perform matrix multiplication for input X and Y. +This operator is used to perform matrix multiplication for input $X$ and $Y$. The equation is: $$Out = X * Y$$ -Both the input `X` and `Y` can carry the LoD (Level of Details) information, -or not. But the output only shares the LoD information with input `X`. +Both the input $X$ and $Y$ can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input $X$. 
)DOC"); } diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index b1ee8051c4c48f575690b38142ae082930fe2070..f524de60dbb3c652aa2a74478af6c0e38fb3cb43 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -61,8 +61,7 @@ class MultiplexOp : public framework::OperatorWithKernel { class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker { public: - MultiplexOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Ids", "The index tensor of multiplex operator."); AddInput("X", "The candidate tensors of multiplex operator.") diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc index 22a37ff1bbf6b8cfb2cbc3c3dbbb20a87c5ea4e7..e19f534f8a2d05cd9b569a0eebb287db3d3321ba 100644 --- a/paddle/operators/nccl_op.cc +++ b/paddle/operators/nccl_op.cc @@ -43,8 +43,7 @@ class NCCLInitOp : public framework::OperatorBase { class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLInitOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Communicator", "Create Communicator for communicating between gpus"); @@ -52,7 +51,7 @@ class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(R"DOC( NCCLInit Operator. @@ -141,8 +140,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel { // AllreduceOp class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLAllReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of AllReduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); @@ -163,8 +161,7 @@ AllReduce the input tensors. // ReduceOp class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLReduceOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of Reduce op"); AddInput("Communicator", "Communicator for communicating between gpus"); @@ -190,8 +187,7 @@ Reduce the tensors. 
// BcastOp class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCCLBcastOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of BcastSend op"); AddInput("Communicator", "Communicator for communicating between gpus"); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index d747cc0cf5f74b886bbd40549673e7d64de952e9..c1046aadafbde303a3a8b12f2377018396b9adb8 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -65,7 +65,7 @@ class NCCLTester : public ::testing::Test { } void NCCLInitOp() { - std::unique_ptr op1(new f::OpDescBind); + std::unique_ptr op1(new f::OpDesc); op1->SetType("ncclInit"); op1->SetOutput("Communicator", {"comm"}); @@ -81,10 +81,9 @@ class NCCLTester : public ::testing::Test { } template - void PerThreadProgram(int gpu_id, const f::OpDescBind &op_desc, - f::Scope *scope) { + void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) { std::unique_lock lk(mu); - const f::OpDescBind *op1 = &op_desc; + const f::OpDesc *op1 = &op_desc; p::GPUPlace place(gpu_id); auto &ctx = dev_ctxs.at(gpu_id); @@ -125,7 +124,7 @@ class NCCLTester : public ::testing::Test { // ncclInitOp with desc TEST(NCCL, ncclInitOp) { - std::unique_ptr op_desc(new f::OpDescBind); + std::unique_ptr op_desc(new f::OpDesc); op_desc->SetType("ncclInit"); op_desc->SetOutput("Communicator", {"x1"}); @@ -145,7 +144,7 @@ TEST(NCCL, ncclInitOp) { // ncclAllReduceOp with desc TEST_F(NCCLTester, ncclAllReduceOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); op2->SetType("ncclAllReduce"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); @@ -192,7 +191,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { // ncclReduceOp with desc TEST_F(NCCLTester, ncclReduceOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); const int kRoot = 0; op2->SetType("ncclReduce"); op2->SetInput("X", {"st"}); @@ -240,7 +239,7 @@ TEST_F(NCCLTester, ncclReduceOp) { // ncclBcastOp with desc TEST_F(NCCLTester, ncclBcastOp) { - std::unique_ptr op2(new f::OpDescBind); + std::unique_ptr op2(new f::OpDesc); const int kRoot = 5; op2->SetType("ncclBcast"); op2->SetInput("X", {"st"}); diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 5ad1610fde041ee934486ef98ba41dca42559100..6dd457f7a2e410b65680004599ab753acbb34f71 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -73,7 +73,7 @@ class NCEOp : public framework::OperatorWithKernel { class NCEOpMaker : public framework::OpProtoAndCheckerMaker { public: - NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); AddInput( diff --git a/paddle/operators/batch_norm_op.md b/paddle/operators/op_documentation/batch_norm_op.md similarity index 100% rename from paddle/operators/batch_norm_op.md rename to paddle/operators/op_documentation/batch_norm_op.md diff --git a/paddle/operators/name_convention.md b/paddle/operators/op_documentation/name_convention.md similarity index 96% rename from paddle/operators/name_convention.md rename to paddle/operators/op_documentation/name_convention.md index 
b5cb176e003b4584321142ac9f1c3380b7010936..a02b356f058da68442516c2705d0bac140f8ef18 100644 --- a/paddle/operators/name_convention.md +++ b/paddle/operators/op_documentation/name_convention.md @@ -35,8 +35,8 @@ Here we give some examples to show how these rules will be used. ```c++ class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker { public: - AccumulateOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AccumulateOpMaker(OpProto *proto, + OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. If the output size is not the same as input size, diff --git a/paddle/operators/net_op_design.md b/paddle/operators/op_documentation/net_op_design.md similarity index 100% rename from paddle/operators/net_op_design.md rename to paddle/operators/op_documentation/net_op_design.md diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/operators/op_documentation/op_markdown_format.md new file mode 100644 index 0000000000000000000000000000000000000000..0ee804d592252c727622cbe59b0644813db3c4fd --- /dev/null +++ b/paddle/operators/op_documentation/op_markdown_format.md @@ -0,0 +1,64 @@ +# Standard Markdown Format for Operators +The following should be the standard format for documentation for all the operators that will get rendered in the `html`: + +``` +Operator Name (In PaddlePaddle) + +Operator Name (Standard) + +Operator description. + +LaTeX equation of how the operator performs an update. + +The signature of the operator. +``` + +Each section mentioned above has been covered in further detail in the rest of the document. + +# PaddlePaddle Operator Name +This should be in all small letters, in case of multiple words, we separate them with an underscore. For example: +`array to lod tensor` should be written as `array_to_lod_tensor`. + +This naming convention should be standard across all PaddlePaddle operators. + +# Standard Operator Name +This is the standard name of the operator as used in the community. The general standard is usually: +- Standard abbreviations like `SGD` are written in all capital letters. +- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word). +- Keep numbers inside a word as is, with no boundary delimiters. +- Follow the name of the operator with the keyword: `Activation Operator.` + +# Operator description +This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section. + +# LaTeX equation +This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`). + +# The signature +This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is: +`Section : +VariableName : (VariableType) VariableDescription +... +... 
+` + + +The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`: + +``` +sgd + +SGD operator + +This operator implements one step of the stochastic gradient descent algorithm. + +param_out = param_learning_rate * grad + +Inputs: +Param : (Tensor) Input parameter +LearningRate : (Tensor) Learning rate of SGD +Grad : (Tensor) Input gradient + +Outputs: +ParamOut : (Tensor) Output parameter +``` diff --git a/paddle/operators/rnn_design.md b/paddle/operators/op_documentation/rnn_design.md similarity index 100% rename from paddle/operators/rnn_design.md rename to paddle/operators/op_documentation/rnn_design.md diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index 936dde22c34a30c5a50e2ac8a76f0f91dfb328ab..40f7a7eed53354fa65373830b0972c0e72ef54da 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -48,7 +48,7 @@ class PadOp : public framework::OperatorWithKernel { class PadOpMaker : public framework::OpProtoAndCheckerMaker { public: - PadOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + PadOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input of pad op. " @@ -116,14 +116,14 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* bind = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); bind->SetType("pad_grad"); - return std::unique_ptr(bind); + return std::unique_ptr(bind); } }; diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index 45fa20280c1ad20f63d6542d5199e002ff60495f..50057eb6483e9c9e745bc07dee26a0bbbbb5a48c 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -67,8 +67,7 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } -Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) +Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -136,8 +135,7 @@ Example: )DOC"); } -Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) +Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of pooling operator. 
" diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index ab85d587a3131237d7a9ec774a11193c70220c7c..3860e295f4b4dbeb2d60cfb304847de39083f1e1 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -40,14 +40,12 @@ class PoolOpGrad : public framework::OperatorWithKernel { class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Pool2dOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Pool3dOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker); + Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker); }; template diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 1a2383f8b80357d2927c3b6a8c57c787ba7e366d..980e9dc08b2ac160e6e06dfb11ff8f3e1279be46 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { public: - MaxPool2dWithIndexOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", @@ -178,8 +177,7 @@ Example: class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { public: - MaxPool3dWithIndexOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of pooling operator. 
" diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc index 4ba40a62ec5f696ad980c2913f7e162879a557e2..ab9f67bfe6b3d6f59b35a57cb8135e9c6d00636e 100644 --- a/paddle/operators/positive_negative_pair_op.cc +++ b/paddle/operators/positive_negative_pair_op.cc @@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker { public: - PositiveNegativePairOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Score", "(Tensor, float) Model Score on an item (with " diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc index 1ace4f2a5935dcb4239526c42599a42d288ff552..21dcd28c67bb5eb1d3af0ac8ba16f1d5df1958a8 100644 --- a/paddle/operators/precision_recall_op.cc +++ b/paddle/operators/precision_recall_op.cc @@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker { public: - PrecisionRecallOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("MaxProbs", "(Tensor, default Tensor) A 2-D tensor with shape N x 1, " diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 317a2a40154f92f2e13a3012d2f7a63df9a69afb..4af8f85277ddb2262aa534f8d81be30449ccf8da 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -38,7 +38,7 @@ class PReluOp : public framework::OperatorWithKernel { class PReluOpMaker : public framework::OpProtoAndCheckerMaker { public: - PReluOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of prelu operator."); AddInput("Alpha", "The alpha weight of prelu operator."); diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc index cc350f6d26e6d8bd6e59f2fda74a3b734df55247..b92f46b5bd4e48a25f8c87873c5df53f1753b71b 100644 --- a/paddle/operators/proximal_adagrad_op.cc +++ b/paddle/operators/proximal_adagrad_op.cc @@ -59,8 +59,7 @@ class ProximalAdagradOp : public framework::OperatorWithKernel { class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - ProximalAdagradOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc index 0b26beb3ac3803c78f45cc2ce0a8f444bdc313b6..2d3bbdaf320a4d6bdf18ec92230a81ad98371498 100644 --- a/paddle/operators/proximal_gd_op.cc +++ b/paddle/operators/proximal_gd_op.cc @@ -47,8 +47,7 @@ class ProximalGDOp : public framework::OperatorWithKernel { class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker { public: - ProximalGDOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " diff --git 
a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index b80b175792f3fc56d689c187b7182198542d7345..b5a9949d236bfa6910227092f0682a599543a425 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -45,8 +45,7 @@ class RankLossOp : public framework::OperatorWithKernel { class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { public: - RankLossOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Label", "(2-D Tensor with shape [batch_size x 1]) " diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 232d926f7b975c3b8ebecad983d0f1cc54b9486f..5981d5745d24e0b2fe68bf8b9852cb8a6094885f 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -234,7 +234,7 @@ class RecurrentOp : public RecurrentBase { auto reverse = Attr(kReverse); framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); for (size_t i = 0; i < seq_len; ++i) { @@ -317,7 +317,7 @@ class RecurrentGradOp : public RecurrentBase { auto reverse = Attr(kReverse); framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); for (size_t step_id = 0; step_id < seq_len; ++step_id) { @@ -497,8 +497,7 @@ class RecurrentGradOp : public RecurrentBase { class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - RecurrentOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(kInputs, "rnn inputs").AsDuplicable(); AddInput(kInitialStates, "rnn initial states").AsDuplicable(); @@ -523,8 +522,7 @@ The ex-state means the state value in the ex-timestep or the previous time step string::Sprintf( "The state variable names. [%s, %s, %s] must be the same order", kExStates, kStates, kInitStateGrads)); - AddAttr(kStepBlock, - "The step block inside RNN"); + AddAttr(kStepBlock, "The step block inside RNN"); AddAttr(kReverse, R"DOC(Calculate RNN reversely or not. 
By default reverse=False @@ -566,13 +564,13 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDescBind(); + virtual std::unique_ptr Apply() const { + auto *grad = new framework::OpDesc(); grad->SetType("recurrent_grad"); for (auto &input_param : this->InputNames()) { grad->SetInput(input_param, this->Input(input_param)); grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param)); + this->InputGrad(input_param, false)); } for (auto &output_param : this->OutputNames()) { @@ -589,7 +587,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { grad->SetAttrMap(this->Attrs()); grad->SetBlockAttr(kStepBlock, *grad_block_[0]); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index eed482c1b458cd442ede523838b400d85c23a155..4e91d1151ebf7a0cd520f2f1fa58a0cc4a0d1bef 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -24,6 +24,7 @@ #include "paddle/framework/framework.pb.h" #include "paddle/framework/lod_tensor.h" #include "paddle/framework/op_registry.h" +#include "paddle/framework/proto_desc.h" #include "paddle/operators/detail/send_recv_impl.h" #include "paddle/operators/detail/simple_block_queue.h" @@ -61,29 +62,76 @@ class RecvOp : public framework::OperatorBase { server_thread_->join(); } + std::string GetGradVarNameForTrainer(const std::string &varname) const { + if (grads_counter_.find(varname) == grads_counter_.end()) { + grads_counter_[varname] = 0; + } + char ret[256]; + snprintf(ret, sizeof(ret), "%s.trainer_%d", varname.c_str(), + grads_counter_[varname]++); + return std::string(ret); + } + void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - // blocking get one var from client. - const framework::LoDTensor &t = rpc_service_->Get(); + // FIXME(typhoonzero): no new scopes for every run. framework::Scope &recv_scope = scope.NewScope(); - // set graph input var - auto *var = recv_scope.Var(Input("RX")); - auto *tensor = var->GetMutable(); - // FIXME(typhoonzero): do not copy - framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); - - std::string program_str = Attr("OptimizeProgram"); - framework::ProgramDesc program_desc; - program_desc.ParseFromString(program_str); - framework::ProgramDescBind program(program_desc); - framework::Executor executor(dev_ctx); - // Run sub graph to get optimized tensor - executor.Run(program, &recv_scope, 0, /*global_block*/ - false /*create_local_scope*/); - - auto *out_var = recv_scope.FindVar("Out"); - // push back - rpc_service_->Push(out_var->Get()); + rpc_service_->SetScope(&recv_scope); + auto param_list = Attr>("ParamList"); + auto grad_list = Attr>("GradList"); + auto trainer_count = Attr("Trainers"); + size_t param_count = param_list.size(); + rpc_service_->Reset(); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. + while (true) { + // Get from multiple trainers, we don't care about order in which + // the gradient arrives, just add suffix 0~n then average the gradient. + for (size_t i = 0; i < param_count * trainer_count; ++i) { + // blocking get one var from client. 
+ const detail::TensorWithName &v = rpc_service_->Get(); + auto grad_var_name = v.first; + auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name); + std::string param_var_name; + if (it != grad_list.end()) { + param_var_name = param_list[it - grad_list.begin()]; + } else { + LOG(ERROR) << "grad have no paired param found!"; + } + VLOG(3) << "recved grad: " << grad_var_name + << " updating param: " << param_var_name; + auto *merged_grad = recv_scope.FindVar(grad_var_name); + if (merged_grad == nullptr) { + // create output of merged var. + auto merged_var = recv_scope.Var(grad_var_name); + merged_var->GetMutable(); + } + + if (trainer_count > 1) { + grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); + } + + auto *var = recv_scope.Var(grad_var_name); + auto *tensor = var->GetMutable(); + // FIXME(typhoonzero): do not copy + framework::CopyFrom(v.second, dev_ctx.GetPlace(), dev_ctx, tensor); + } + rpc_service_->Reset(); + + std::string program_str = Attr("OptimizeProgram"); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDesc program(program_desc); + framework::Executor executor(dev_ctx); + // Run sub graph to get optimized tensor + try { + executor.Run(program, &recv_scope, 0, /*global_block*/ + false /*create_local_scope*/, false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + rpc_service_->Done(); + grads_counter_.clear(); + } // while(true) } protected: @@ -93,13 +141,14 @@ class RecvOp : public framework::OperatorBase { // grpc send/recv service implement to register. std::shared_ptr rpc_service_; std::shared_ptr server_thread_; + mutable std::unordered_map grads_counter_; }; class RecvOpMaker : public framework::OpProtoAndCheckerMaker { public: - RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("RX", "(Tensor) Input tensor to be saved"); + AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable(); AddComment(R"DOC( Recv operator @@ -112,6 +161,17 @@ This operator will recv tensor from send_op .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); AddAttr("OptimizeProgram", "type string", "Serialized ProgramDesc string for recv to run."); + AddAttr>( + "ParamList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr>( + "GradList", "type list of string", + "grad->param name mapping to find which param to optimize.") + .SetDefault({}); + AddAttr("Trainers", "type int", + "Number of trainers in the current cluster job") + .SetDefault(1); } }; diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index fedc2a5c37ff84ffdf8ebd2f19296db92e256e5b..19220f2f59d218e9b4d57b260b35df64b4abd2cb 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -83,7 +83,7 @@ class ReduceGradOp : public framework::OperatorWithKernel { class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReduceOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor. Tensors with rank at most 6 are " @@ -135,8 +135,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
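The rewritten RecvOp above turns the server side into a simple parameter-server loop: it keeps receiving gradients from all trainers, pairs each gradient with its parameter through the positional ParamList/GradList attributes, stores each trainer's copy under a ".trainer_N" suffix, and then runs the serialized optimize program. A minimal standalone sketch of that naming and pairing logic (illustrative names only, not the framework's API):

  // Standalone illustration of the RecvOp bookkeeping: each incoming gradient
  // is paired with its parameter by position, and when several trainers send
  // the same gradient it is stored under a per-trainer suffix
  // ("w@GRAD.trainer_0", "w@GRAD.trainer_1", ...).
  #include <cstdio>
  #include <string>
  #include <unordered_map>
  #include <vector>

  int main() {
    // Hypothetical attribute values; in the op they come from ParamList/GradList.
    std::vector<std::string> param_list = {"w", "b"};
    std::vector<std::string> grad_list = {"w@GRAD", "b@GRAD"};
    std::unordered_map<std::string, int> grads_counter;

    // Pair a gradient with its parameter by position, as RecvOp::Run does.
    auto grad_to_param = [&](const std::string &g) -> std::string {
      for (size_t i = 0; i < grad_list.size(); ++i)
        if (grad_list[i] == g) return param_list[i];
      return "<unpaired>";
    };
    // Suffix the gradient name per trainer, like GetGradVarNameForTrainer.
    auto name_for_trainer = [&](const std::string &g) -> std::string {
      char buf[256];
      std::snprintf(buf, sizeof(buf), "%s.trainer_%d", g.c_str(),
                    grads_counter[g]++);
      return std::string(buf);
    };

    for (int trainer = 0; trainer < 2; ++trainer)  // two trainers send grads
      for (const auto &g : grad_list)
        std::printf("recv %-7s -> stored as %-16s (updates %s)\n", g.c_str(),
                    name_for_trainer(g).c_str(), grad_to_param(g).c_str());
    return 0;
  }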
class ReduceSumOpMaker : public ReduceOpMaker { public: - ReduceSumOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceSum", "sum"); AddComment(comment_); @@ -145,8 +144,7 @@ class ReduceSumOpMaker : public ReduceOpMaker { class ReduceMeanOpMaker : public ReduceOpMaker { public: - ReduceMeanOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceMean", "mean"); AddComment(comment_); @@ -155,8 +153,7 @@ class ReduceMeanOpMaker : public ReduceOpMaker { class ReduceMaxOpMaker : public ReduceOpMaker { public: - ReduceMaxOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceMax", "max"); AddComment(comment_); @@ -165,8 +162,7 @@ class ReduceMaxOpMaker : public ReduceOpMaker { class ReduceMinOpMaker : public ReduceOpMaker { public: - ReduceMinOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker) : ReduceOpMaker(proto, op_checker) { SetComment("ReduceMin", "min"); AddComment(comment_); diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e3079ee0c91c337dca1e57729438fb9be4a0ff4 --- /dev/null +++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc @@ -0,0 +1,234 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +class ReorderLoDTensorByRankTableOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto, + OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) the input lod tensor need to be reordered."); + AddInput("RankTable", + "(LoDRankTable) the rank table that input need follow"); + AddOutput("Out", "(LoDTensor) reordered lod tensor"); + AddComment(R"DOC(ReorderLoDTensorByRankTable + +Reorder the input X by the rank of `RankTable`. If `RankTable` is ordered by +index [3, 0, 2, 1]. Input X will reorder its sequence, the third sequence of +X will be the first sequence of Output. + +NOTE: The RankTable does not need to be calculated by X. + +For example: +The X = [Seq0, Seq1, Seq2, Seq3]. The indices of RankTable are [3, 0, 2, 1]. + +The Out = [Seq3, Seq0, Seq2, Seq1] with correct LoD information. 
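The ReorderLoDTensorByRankTable comment above describes the reordering by example; the core semantics are a permutation of whole sequences by the rank table's indices, plus rebuilding the LoD. A minimal sketch of the permutation itself (illustrative only; the operator additionally copies per-sequence LoD and tensor slices):

  // Sketch of ReorderLoDTensorByRankTable semantics: pick sequences from X
  // in the order given by the rank table indices.
  #include <cstdio>
  #include <string>
  #include <vector>

  int main() {
    std::vector<std::string> x = {"Seq0", "Seq1", "Seq2", "Seq3"};
    std::vector<size_t> rank_indices = {3, 0, 2, 1};  // from the RankTable

    std::vector<std::string> out;
    out.reserve(rank_indices.size());
    for (size_t idx : rank_indices) out.push_back(x[idx]);

    for (const auto &s : out) std::printf("%s ", s.c_str());
    std::printf("\n");  // prints: Seq3 Seq0 Seq2 Seq1
    return 0;
  }

The grad op defined later in this file applies the inverse permutation: it pairs each output position with its rank index and sorts by that index before copying the gradient slices back.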
+)DOC"); + } +}; + +class ReorderLoDTensorByRankTableBase : public framework::OperatorBase { + public: + ReorderLoDTensorByRankTableBase(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto &x = + detail::Ref(scope.FindVar(Input("X")), + "Cannot find input lod tensor variable %s", Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")), + "Cannot find input rank table variable %s", + Input("RankTable")) + .Get(); + auto &out = + *detail::Ref(scope.FindVar(Output("Out")), + "Cannot find output lod tensor variable %s", Output("Out")) + .GetMutable(); + + out.Resize(x.dims()); + out.mutable_data(x.place(), x.type()); + this->process(dev_ctx, x, rank_table, &out); + } + + protected: + virtual void process(const platform::DeviceContext &dev_ctx, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const = 0; + + struct AbsoluteRankTableItem { + size_t offset; // the absolute/accumulated offset. + size_t length; // the length + framework::LoD lod; + }; + + std::vector GetAbsoluteOffsetAndLengthByLoDRankTable( + const framework::LoDTensor &x) const { + std::vector absolute_table; + size_t level = 0; + size_t size = x.lod()[level].size(); + + for (size_t i = 0; i < size - 1; ++i) { + auto lod_offset = + framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level); + + auto &offset = lod_offset.second; + + absolute_table.emplace_back(); + absolute_table.back().length = offset.second - offset.first; + absolute_table.back().offset = offset.first; + absolute_table.back().lod = lod_offset.first; + } + return absolute_table; + } + + size_t CopyTensorAndLod(const platform::DeviceContext &dev_ctx, + const AbsoluteRankTableItem &item, + const framework::LoDTensor &x, + framework::LoDTensor *out, size_t out_offset) const { + auto &out_lod = *out->mutable_lod(); + auto len = item.length; + auto x_offset = item.offset; + + if (out_lod.empty()) { + for (size_t i = 0; i < item.lod.size(); ++i) { + out_lod.push_back(std::vector({0})); + } + } + + for (size_t i = 0; i < out_lod.size(); ++i) { + auto &out_v = out_lod[i]; + auto &new_lod_v = item.lod[i]; + + for (auto &detail : new_lod_v) { + out_v.push_back(out_v.back() + detail); + } + } + + auto x_sliced = x.Slice(x_offset, x_offset + len); + auto out_sliced = out->Slice(out_offset, out_offset + len); + + framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced); + out_offset += len; + return out_offset; + } +}; + +class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankTableOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::DeviceContext &dev_ctx, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + size_t out_offset = 0; + out->mutable_lod()->clear(); + for (auto &item : rank_table.items()) { + PADDLE_ENFORCE_LT(item.index, absolute_table.size()); + out_offset = CopyTensorAndLod(dev_ctx, 
absolute_table[item.index], x, out, + out_offset); + } + } +}; + +class IdentityInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ReorderLodTensorByRankGradOpMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("reorder_lod_tensor_by_rank_grad"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetInput("RankTable", Input("RankTable")); + return std::unique_ptr(grad_op); + } +}; + +class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase { + public: + ReorderLoDTensorByRankGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {} + + protected: + void process(const platform::DeviceContext &dev_ctx, + const framework::LoDTensor &x, + const framework::LoDRankTable &rank_table, + framework::LoDTensor *out) const override { + auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x); + + // offsets = enumerate([item.index for item in rank_table.items()]) + std::vector> offsets; + offsets.reserve(rank_table.items().size()); + for (size_t i = 0; i < rank_table.items().size(); ++i) { + offsets.push_back({i, rank_table.items()[i].index}); + } + + // offsets.sort(key=lambda x: x[1]) + std::sort( + offsets.begin(), offsets.end(), + [](const std::pair &a, + const std::pair &b) { return a.second < b.second; }); + + // Copy TensorAndLod + size_t out_offset = 0; + for (auto &offset : offsets) { + out_offset = this->CopyTensorAndLod(dev_ctx, absolute_table[offset.first], + x, out, out_offset); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(reorder_lod_tensor_by_rank, + ops::ReorderLoDTensorByRankTableOp, + ops::ReorderLodTensorByRankGradOpMaker, + ops::ReorderLoDTensorByRankTableOpProtoMaker, + ops::IdentityInferShape); +REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad, + ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape); diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index d82d828747c0c822195b699359b8e62d1cf7e92d..2c5167295d8546358b09e018ee02f0941f2897d1 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -77,8 +77,7 @@ class ReshapeOp : public framework::OperatorWithKernel { class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: - ReshapeOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of reshape operator."); AddOutput("Out", "The output tensor of reshape operator."); diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc index fc3f9b8988ec7fe0093ef6b09a105747b0025ec1..f7c250bf913b9213e7d7e2cca9ecadf74cac91a1 100644 --- a/paddle/operators/rmsprop_op.cc +++ b/paddle/operators/rmsprop_op.cc @@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel { class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { public: - 
RmspropOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor, default Tensor) " diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc index 3a035f0b9acb94bab60659938e11b4996b8eaa0f..795bdf3e51a2dd323e85c497fcf203ad3ed54183 100644 --- a/paddle/operators/rnn_memory_helper_op.cc +++ b/paddle/operators/rnn_memory_helper_op.cc @@ -57,15 +57,14 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - RNNMemoryHelperOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", ""); AddOutput("Out", ""); AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(""); } }; @@ -114,8 +113,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { class RNNMemoryHelperGradOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: - RNNMemoryHelperGradOpInfoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(framework::GradVarName("Out"), ""); AddInput("X", ""); @@ -124,7 +122,7 @@ class RNNMemoryHelperGradOpInfoMaker AddAttr("dtype", "(int, default 5 (FP32)) " "Output data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); AddComment(""); } }; diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 75fcea8401fbbc2943c0d6a50ca81288268823d8..85b6a8e15160d0c259a270f5e12ca9e67a6508ab 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -99,8 +99,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: - ROIPoolOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor), " diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc index 5203a5079c8b125f8dc156202f70ce76711a1e30..6b116a9fe704e6ddf18c22455c06346ea14909d2 100644 --- a/paddle/operators/row_conv_op.cc +++ b/paddle/operators/row_conv_op.cc @@ -76,8 +76,7 @@ class RowConvGradOp : public framework::OperatorWithKernel { class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { public: - RowConvOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor), the input(X) is a LodTensor, which supports " diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index d4921cb80c8d78c52ae1887c36819b52621470eb..eae1146d6c61fe56ebc48ac50e1eacd62e3fa7d0 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -94,8 +94,7 @@ class SaveOp : public framework::OperatorBase { class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - SaveOpProtoMaker(framework::OpProto *proto, - 
framework::OpAttrChecker *op_checker) + SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor ) Input tensor to be saved"); AddComment(R"DOC( diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index d848be823e602e595f66138f4b5dfd6e38dd85a1..ee39888713544703ee8d305b2c04e4b03deeceab 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -38,7 +38,7 @@ class ScaleOp : public framework::OperatorWithKernel { template class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor of scale operator."); AddOutput("Out", "(Tensor) Output tensor of scale operator."); @@ -58,13 +58,13 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", GetAttr("scale")); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 573bbcd1875c86a2d843b6c5e9c1af4d48a5cb18..173c9582557eb4e020824d5830731e3e2312dc3c 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -78,8 +78,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { public: - ScatterOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Ref", "The source input of scatter op"); AddInput("Index", diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc index a3059847f2d420359b347e3a5d514d8a3829a4e2..a5681910708bae584bdf4217b61ca63a3988e1ff 100644 --- a/paddle/operators/send_op.cc +++ b/paddle/operators/send_op.cc @@ -34,45 +34,56 @@ class SendOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) { // init client when the operator is created at runtime. - if (!client_) { - std::string endpoint = Attr("endpoint"); - client_.reset(new detail::RPCClient( - grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials()))); - // TODO(typhoonzero): how to call InitVariables + std::vector endpoints = + Attr>("endpoints"); + for (auto ep : endpoints) { + client_map_[ep].reset(new detail::RPCClient( + grpc::CreateChannel(ep, grpc::InsecureChannelCredentials()))); } } void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - auto iname = Input("X"); - auto oname = Output("Out"); - // TODO(typhoonzero): currently it's non-blocking, - // should block until server responds. - bool ret = client_->SendVariable(scope, iname, oname); - if (!ret) { - LOG(ERROR) << "send variable error"; + auto ins = Inputs("X"); + std::vector epmap = Attr>("epmap"); + // TODO(typhoonzero): use async calls to send multiple variable asyncly. 
+ for (size_t i = 0; i < ins.size(); ++i) { + bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]); + if (!ret) { + LOG(ERROR) << "send variable error: " << ins[i]; + } + } + // TODO(typhoonzero): support async optimization + client_map_[epmap[0]]->Wait(); + for (size_t i = 0; i < ins.size(); ++i) { + bool ret = client_map_[epmap[i]]->GetVariable(scope, ins[i]); + if (!ret) { + LOG(ERROR) << "GetVariable error: " << ins[i]; + } } } protected: - std::shared_ptr client_{nullptr}; + mutable std::unordered_map> + client_map_; }; class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: - SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + SendOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "(Tensor) Input tensor to be saved"); - AddOutput("Out", "(Tensor) Output fetched from server"); + AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable(); AddComment(R"DOC( Recv operator This operator will recv tensor from send_op )DOC"); - AddAttr("endpoint", - "(string, default 127.0.0.1:6164)" - "IP address to listen on.") - .SetDefault("127.0.0.1:6164") - .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to."); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping"); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index 3e2e2051afacb748877e3b0c3dec8d6662ac4e72..d899d8154cce57a88de0e1d19e3393528ce367e2 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -16,12 +16,14 @@ // a RemoteOptimizer. 
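SendOp above now sends a duplicable list of inputs instead of a single tensor: the "endpoints" attribute lists the servers, "epmap" gives, per input variable, the endpoint it should go to, and one RPC client is kept per endpoint; after sending, the op waits and fetches the updated variables back. A small standalone sketch of the epmap lookup (hypothetical names, plain C++ in place of the gRPC client):

  // Sketch of the endpoint mapping used by SendOp: input variable i is sent
  // via the client for epmap[i].
  #include <cstdio>
  #include <map>
  #include <string>
  #include <vector>

  int main() {
    std::vector<std::string> inputs = {"w@GRAD", "b@GRAD", "emb@GRAD"};
    std::vector<std::string> epmap = {"127.0.0.1:6174", "127.0.0.1:6175",
                                      "127.0.0.1:6174"};

    // One fake client per distinct endpoint (the real op keeps an RPCClient
    // per endpoint in client_map_).
    std::map<std::string, int> clients;  // endpoint -> fake client id
    for (const auto &ep : epmap)
      clients.emplace(ep, static_cast<int>(clients.size()));

    for (size_t i = 0; i < inputs.size(); ++i)
      std::printf("send %-9s via client %d (%s)\n", inputs[i].c_str(),
                  clients[epmap[i]], epmap[i].c_str());
    return 0;
  }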
#include +#include #include #include "gtest/gtest.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" +#include "paddle/string/printf.h" USE_NO_KERNEL_OP(send); USE_NO_KERNEL_OP(recv); @@ -33,30 +35,33 @@ std::unique_ptr recv_op; void InitTensorsInScope(paddle::framework::Scope &scope, paddle::platform::CPUPlace &place) { paddle::platform::CPUDeviceContext ctx(place); - auto var = scope.Var("X"); - auto tensor = var->GetMutable(); - tensor->Resize({10, 10}); - float *expect = tensor->mutable_data(place); - for (int64_t i = 0; i < tensor->numel(); ++i) { - expect[i] = static_cast(i); + for (int i = 0; i < 2; ++i) { + auto var_name = paddle::string::Sprintf("x%d", i); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } } auto out_var = scope.Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); - tensor->mutable_data(place); // allocate + out_tensor->mutable_data(place); // allocate } void AddOp(const std::string &type, const paddle::framework::VariableNameMap &inputs, const paddle::framework::VariableNameMap &outputs, paddle::framework::AttributeMap attrs, - paddle::framework::BlockDescBind *block) { + paddle::framework::BlockDesc *block) { // insert output for (auto kv : outputs) { for (auto v : kv.second) { auto var = block->Var(v); - var->SetDataType(paddle::framework::DataType::FP32); + var->SetDataType(paddle::framework::proto::DataType::FP32); } } @@ -78,10 +83,10 @@ void StartServerNet() { InitTensorsInScope(scope, place); // sub program run in recv_op, for simple test we use sum - paddle::framework::ProgramDescBind program; - paddle::framework::BlockDescBind *block = program.MutableBlock(0); + paddle::framework::ProgramDesc program; + paddle::framework::BlockDesc *block = program.MutableBlock(0); // X for server side tensors, RX for received tensers, must be of same shape. 
- AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block); + AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block); paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); @@ -89,8 +94,8 @@ void StartServerNet() { PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); attrs.insert({"OptimizeProgram", program_proto}); - recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, - {{"Out", {"Out"}}}, attrs); + recv_op = paddle::framework::OpRegistry::CreateOp( + "recv", {{"RX", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); recv_op->Run(scope, ctx); } @@ -107,11 +112,11 @@ TEST(SendRecvOp, CPU) { attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); auto send_op = paddle::framework::OpRegistry::CreateOp( - "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); + "send", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); send_op->Run(scope, ctx); - auto in_var = scope.Var("X"); + auto in_var = scope.Var("x0"); auto tensor = in_var->GetMutable(); float *expected = tensor->data(); diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index 9c7e5456e8238af70f920aaaa9cc652d5d12d3e9..2f0aad2003e48952ca26ca27573bc45386a4e585 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel { class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceConcatOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LodTensorArray) Input is a vector of LoDTensor, " @@ -68,12 +67,12 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker { "The level should be less than the level number of inputs.") .SetDefault(0); AddComment(R"DOC( -The sequence_concat operator concatenates multiple LoDTensors. -It only supports sequence (LoD Tensor with level number is 1) +The sequence_concat operator concatenates multiple LoDTensors. +It only supports sequence (LoD Tensor with level number is 1) or a nested sequence (LoD tensor with level number is 2) as its input. - Case1: If the axis is other than 0(here, axis is 1 and level is 1), - each input should have the same LoD information and the LoD + each input should have the same LoD information and the LoD information of the output keeps the same as the input. LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4) @@ -81,7 +80,7 @@ or a nested sequence (LoD tensor with level number is 2) as its input. LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4) - Case2: - If the axis is 0(here, leve is 0), the inputs are concatenated along + If the axis is 0(here, leve is 0), the inputs are concatenated along time steps, the LoD information of the output need to re-compute. The LoD information of level-1 should be same. 
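For the axis = 1 case described above, concatenation happens per time step along the feature dimension and the LoD is carried over unchanged; a toy sketch with nested vectors (not the LoDTensor API), mirroring how the second dimension grows from 3 to 7 in the example:

  // Toy sketch of sequence_concat with axis=1: every time step keeps its
  // position, only the feature vectors are concatenated, so the LoD of the
  // output equals the LoD of the inputs.
  #include <cstdio>
  #include <vector>

  int main() {
    using Step = std::vector<float>;
    // Two inputs with the same number of time steps (same LoD): feature
    // widths 3 and 4, so the concatenated width is 7.
    std::vector<Step> x0 = {{1, 1, 1}, {2, 2, 2}, {3, 3, 3}, {4, 4, 4}};
    std::vector<Step> x1 = {{5, 5, 5, 5}, {6, 6, 6, 6}, {7, 7, 7, 7}, {8, 8, 8, 8}};

    std::vector<Step> out(x0.size());
    for (size_t t = 0; t < x0.size(); ++t) {
      out[t] = x0[t];
      out[t].insert(out[t].end(), x1[t].begin(), x1[t].end());
    }

    for (const auto &step : out) {
      for (float v : step) std::printf("%.0f ", v);
      std::printf("\n");
    }
    return 0;
  }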
@@ -125,8 +124,9 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, - sequence_concat_grad, ops::SequenceConcatGradOp); +REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp, + ops::SequenceConcatOpMaker, sequence_concat_grad, + ops::SequenceConcatGradOp, false); REGISTER_OP_CPU_KERNEL( sequence_concat, ops::SequenceConcatOpKernel); diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index f5c4f1c13331f45183d2810a95f773ad52aca13b..c5b7c81bd7c6e1110aa9e2ced629bea5d88832d1 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -100,8 +100,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel { class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceConvOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", diff --git a/paddle/operators/sequence_expand_op.cc b/paddle/operators/sequence_expand_op.cc index 770161b593e232f2f2cf4a2ccb952391557b9a3d..6227408be0529e63318bddcf6fa4f1927bb05eca 100644 --- a/paddle/operators/sequence_expand_op.cc +++ b/paddle/operators/sequence_expand_op.cc @@ -37,8 +37,7 @@ class SequenceExpandOp : public framework::OperatorWithKernel { class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceExpandOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor or LoDTensor) The input(X) of this operator can be a " diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index 3526e45a1b6565bc21413d381d15c02f08c587bd..0eb675caaddf1274a941cbfe29017cb9ea11f40f 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -37,8 +37,7 @@ class SequencePoolOp : public framework::OperatorWithKernel { class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequencePoolOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp"); AddOutput("Out", diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 481db8f9e548de68c102210035d4ff037ab56261..309ee1f3a82c35104db74084c4ef761bd4b06695 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceSliceOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor), " diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index 37d5452e6ba59411f9ab2e1460fc8584583f0321..b74766f012e333cc2a317e6efe17c5b60238924a 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -33,8 
+33,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { public: - SequenceSoftmaxOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension " @@ -51,10 +50,14 @@ input Tensor can be either [N, 1] or [N], where N is the sum of the length of all sequences. The algorithm works as follows: + for i-th sequence in a mini-batch: - $$Out(X[lod[i]:lod[i+1]], :) = - \frac{\exp(X[lod[i]:lod[i+1], :])} - {\sum(\exp(X[lod[i]:lod[i+1], :]))}$$ + +$$ +Out(X[lod[i]:lod[i+1]], :) = \ +\frac{\exp(X[lod[i]:lod[i+1], :])} \ +{\sum(\exp(X[lod[i]:lod[i+1], :]))} +$$ For example, for a mini-batch of 3 sequences with variable-length, each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7], diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 121bf60b27c62c1b0dd4c34c12962b7098e29ae2..fb4b43e472f86f2fa30a7013731c4621cb2b3e0e 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -43,7 +43,7 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpMaker : public framework::OpProtoAndCheckerMaker { public: - SGDOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("LearningRate", "(Tensor) Learning rate of SGD"); diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index c380e606869fd2c559c7d5f378857ca74fa8d8d3..48194a547bbea5ddda7c5f3e2421431d1d81042d 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -54,8 +54,7 @@ class ShrinkRNNMemoryOp : public ArrayOp { class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); @@ -137,14 +136,14 @@ class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); op->SetType("shrink_rnn_memory_grad"); op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index b8a1bf122a78df1e0d8291c77a61b3f917d40960..9b5227d92d1cfd7d6ac7e28186fbba6d887475b1 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -86,8 +86,8 @@ class SigmoidCrossEntropyWithLogitsGradOp class SigmoidCrossEntropyWithLogitsOpMaker : public framework::OpProtoAndCheckerMaker { 
public: - SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto, + OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor), a 2-D tensor with shape N x D, " diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index d5a7ccb77e7d9ad3a93702861dbab295c4ab5bce..b2459fb2f53939b3131af1540044ce361b87d08a 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -34,7 +34,7 @@ class SignOp : public framework::OperatorWithKernel { template class SignOpMaker : public framework::OpProtoAndCheckerMaker { public: - SignOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + SignOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor of sign operator."); AddOutput("Out", "(Tensor) Output tensor of sign operator."); @@ -50,13 +50,13 @@ class SignGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttr("scale", 0.0f); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index 56e8d9058fcc035c28e74daff778c4e034f46b44..42a53cfa06f7724000ff59c69504629890f6ec87 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -47,8 +47,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { template class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { public: - SmoothL1LossOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor, default Tensor) A tensor with rank at least 2. " diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 0988c83d43535d7ee1bcef87bf506e5db1a3ecc0..6b3f19bb46c45b7dd8ec6c23ee449521b36d759c 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -36,8 +36,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftmaxOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of softmax. 
" diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index 0c302288637ad1713e133d37faa0fb338e1f7022..d9911a6901447d8900c3881a60c7a0852dcbf429 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -20,8 +20,7 @@ namespace operators { class SoftmaxWithCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { public: - SoftmaxWithCrossEntropyOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Logits", "(Tensor, default: Tensor), The unscaled log probabilities " @@ -174,8 +173,8 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto* grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto* grad_op = new framework::OpDesc(); grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", Input("Label")); grad_op->SetInput("Softmax", Output("Softmax")); @@ -184,7 +183,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index f164a4771186635232fea46327ca1fb8b86f2852..3542d8624fec49f75314f046434cbcadf307497e 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -118,8 +118,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - SplitLoDTensorOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input LoDTensor"); AddInput("Mask", "A bool column vector which mask the input"); @@ -164,8 +163,8 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("merge_lod_tensor"); grad_op->SetInput("InTrue", OutputGrad("OutTrue")); grad_op->SetInput("InFalse", OutputGrad("OutFalse")); @@ -173,7 +172,7 @@ class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker { grad_op->SetInput("X", Input("X")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc index 275b25e96aa75fdbcb7275e272c49ea8d278d2c8..4dfae043cb1091c9491d89aec4d1415d4741e013 100644 --- a/paddle/operators/split_op.cc +++ b/paddle/operators/split_op.cc @@ -65,7 +65,7 @@ class SplitOp : public framework::OperatorWithKernel { class SplitOpMaker : public framework::OpProtoAndCheckerMaker { public: - SplitOpMaker(framework::OpProto *proto, 
framework::OpAttrChecker *op_checker) + SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input tensor of the split operator."); AddOutput("Out", "(Tensor) Output tensors of the split operator.") @@ -108,13 +108,13 @@ class SplitGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto op = new framework::OpDesc(); op->SetType("concat"); op->SetInput("X", OutputGrad("Out")); op->SetOutput("Out", InputGrad("X")); op->SetAttrMap(Attrs()); - return std::unique_ptr(op); + return std::unique_ptr(op); } }; diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc index b1807b62616b80ea8a9e48409e0760c1c7b36a38..c0aa87b0f06ca9c7d156dfdf8df188da68ac1450 100644 --- a/paddle/operators/spp_op.cc +++ b/paddle/operators/spp_op.cc @@ -18,7 +18,7 @@ namespace operators { class SppOpMaker : public framework::OpProtoAndCheckerMaker { public: - SppOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + SppOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index 50bc6da196e642e3860874cfb883390dd2e93215..9e097176f3434e81e31f2ecf4093af47b654e816 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquaredL2DistanceOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc index 3cff61a02f71fadf99f73787e2b2c179f7d441a8..9c239042cb5127af7eebc0e534da7a7705388de8 100644 --- a/paddle/operators/squared_l2_norm_op.cc +++ b/paddle/operators/squared_l2_norm_op.cc @@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel { class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { public: - SquaredL2NormOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of squared_l2_norm op."); AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc index 68f064eaee5851333ddf9767b7138da83a28503d..230cc1ab0bbd5ac57eb7494795e3fbcdf02c3cc8 100644 --- a/paddle/operators/strided_memcpy_test.cc +++ b/paddle/operators/strided_memcpy_test.cc @@ -85,8 +85,10 @@ TEST(StridedMemcpy, GPUCrop) { platform::GPUPlace gpu0(0); platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); + int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); - memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); framework::DDim src_stride({5, 1}); @@ -96,7 
+98,6 @@ TEST(StridedMemcpy, GPUCrop) { framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({2, 1}); - platform::CUDADeviceContext ctx(gpu0); StridedMemcpy(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst); @@ -122,9 +123,10 @@ TEST(StridedMemcpy, GPUConcat) { platform::GPUPlace gpu0(0); platform::CPUPlace cpu; + platform::CUDADeviceContext ctx(gpu0); int* gpu_src = reinterpret_cast(memory::Alloc(gpu0, sizeof(src))); - memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src)); + memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream()); int dst[8]; int* gpu_dst = reinterpret_cast(memory::Alloc(gpu0, sizeof(dst))); @@ -132,7 +134,6 @@ TEST(StridedMemcpy, GPUConcat) { framework::DDim src_stride({2, 1}); framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({4, 1}); - platform::CUDADeviceContext ctx(gpu0); StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst); StridedMemcpy(ctx, gpu_src, src_stride, dst_dim, dst_stride, diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index cd52672f78e3e5826e8dfff92fb8e4668c5c6dcd..891839bf9cd991e15d96b86e24ea61b09e35a7c7 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -29,7 +29,7 @@ class SumOp : public framework::OperatorWithKernel { "Output(Out) of SumOp should not be null."); if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] == - framework::VarDesc::LOD_TENSOR_ARRAY) { + framework::proto::VarDesc::LOD_TENSOR_ARRAY) { return; // skip runtime infershape when is tensor array; } @@ -72,8 +72,8 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NE(dtype, -1, "Sum operator should have at least one tensor"); - return framework::OpKernelType(static_cast(dtype), - ctx.device_context()); + return framework::OpKernelType( + static_cast(dtype), ctx.device_context()); } else if (x_vars[0]->IsType()) { return framework::OpKernelType( framework::ToDataType( @@ -98,7 +98,7 @@ class SumOp : public framework::OperatorWithKernel { class SumOpMaker : public framework::OpProtoAndCheckerMaker { public: - SumOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + SumOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); @@ -106,8 +106,8 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sum operator. -This operators sums the input tensors. All the inputs can carry the -LoD (Level of Details) information. However, the output only shares +This operators sums the input tensors. All the inputs can carry the +LoD (Level of Details) information. However, the output only shares the LoD information with the first input. )DOC"); } @@ -115,10 +115,10 @@ the LoD information with the first input. 
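The SumOp comment above notes that all inputs may carry LoD but only the first input's LoD is propagated to the output; a standalone sketch of that convention (fake types, elementwise sum only):

  // Sketch of the SumOp convention: elementwise sum of all inputs, output
  // LoD copied from the first input only.
  #include <cassert>
  #include <cstdio>
  #include <vector>

  struct FakeLoDTensor {
    std::vector<std::vector<size_t>> lod;  // level-of-detail offsets
    std::vector<float> data;
  };

  FakeLoDTensor Sum(const std::vector<FakeLoDTensor> &inputs) {
    assert(!inputs.empty());
    FakeLoDTensor out;
    out.lod = inputs[0].lod;  // only the first input's LoD is propagated
    out.data.assign(inputs[0].data.size(), 0.0f);
    for (const auto &in : inputs) {
      assert(in.data.size() == out.data.size());
      for (size_t i = 0; i < in.data.size(); ++i) out.data[i] += in.data[i];
    }
    return out;
  }

  int main() {
    FakeLoDTensor a{{{0, 2, 4}}, {1, 2, 3, 4}};
    FakeLoDTensor b{{{0, 1, 4}}, {10, 20, 30, 40}};
    FakeLoDTensor s = Sum({a, b});
    for (float v : s.data) std::printf("%.0f ", v);  // 11 22 33 44
    std::printf("\n");
    return 0;
  }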
class SumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind& op_desc, - framework::BlockDescBind* block) const override { + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { auto& inputs = op_desc.Input("X"); - auto var_type = framework::VarDesc::SELECTED_ROWS; + auto var_type = framework::proto::VarDesc::SELECTED_ROWS; for (auto& name : op_desc.Input("X")) { VLOG(10) << name << " " @@ -128,12 +128,12 @@ class SumOpVarTypeInference : public framework::VarTypeInference { bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [block](const std::string& name) { return block->FindRecursiveOrCreateVar(name)->GetType() == - framework::VarDesc::LOD_TENSOR; + framework::proto::VarDesc::LOD_TENSOR; }); auto is_tensor_array = [block](const std::string& name) { return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() == - framework::VarDesc::LOD_TENSOR_ARRAY; + framework::proto::VarDesc::LOD_TENSOR_ARRAY; }; bool any_input_is_tensor_array = @@ -152,9 +152,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference { PADDLE_ENFORCE(all_inputs_are_tensor_array, "Not all inputs are tensor array:\n%s", os.str()); } - var_type = framework::VarDesc::LOD_TENSOR_ARRAY; + var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY; } else if (any_input_is_lod_tensor) { - var_type = framework::VarDesc::LOD_TENSOR; + var_type = framework::proto::VarDesc::LOD_TENSOR; } auto out_var_name = op_desc.Output("Out").front(); @@ -169,20 +169,19 @@ class SumGradMaker : public framework::GradOpDescMakerBase { public: using framework::GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() - const override { - auto x_grads = InputGrad("X"); - std::vector> grad_ops; + std::vector> operator()() const override { + auto x_grads = InputGrad("X", false); + std::vector> grad_ops; grad_ops.reserve(x_grads.size()); auto og = OutputGrad("Out"); std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops), [&og](const std::string& x_grad) { - auto* grad_op = new framework::OpDescBind(); + auto* grad_op = new framework::OpDesc(); grad_op->SetType("scale"); grad_op->SetInput("X", og); grad_op->SetOutput("Out", {x_grad}); grad_op->SetAttr("scale", 1.0f); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); }); return grad_ops; } diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 2835b84f75cad6c8fb01d02b93bb9ff79edb1088..90cbc19d1b1bab2e639e3d6d5b28cd13b30542f6 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -51,8 +51,7 @@ class WriteToArrayOp : public ArrayOp { class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: - WriteToArrayOpProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensor) the tensor will be written to tensor array"); AddInput( @@ -97,14 +96,14 @@ class WriteToArrayInferShape : public framework::InferShapeBase { class WriteToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto x_name = 
op_desc.Input("X")[0]; auto out_name = op_desc.Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), "Cannot found %s", out_name); - out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY); + out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); auto *x = block->FindVarRecursive(x_name); if (x != nullptr) { out.SetDataType(x->GetDataType()); @@ -140,8 +139,7 @@ class ReadFromArrayOp : public ArrayOp { class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { public: - ReadFromArrayProtoMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(TensorArray) the array will be read from."); AddInput("I", @@ -177,14 +175,14 @@ class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("read_from_array"); grad_op->SetInput("I", Input("I")); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; @@ -193,14 +191,14 @@ class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad_op = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); grad_op->SetType("write_to_array"); grad_op->SetInput("I", Input("I")); grad_op->SetInput("X", OutputGrad("Out")); grad_op->SetOutput("Out", InputGrad("X")); grad_op->SetAttrMap(Attrs()); - return std::unique_ptr(grad_op); + return std::unique_ptr(grad_op); } }; diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc index 16ae925eb5cab1c05f3bc376972cabadc4367d20..bb72210bb67f925af3e450961069f0737dbde35e 100644 --- a/paddle/operators/top_k_op.cc +++ b/paddle/operators/top_k_op.cc @@ -46,7 +46,7 @@ class TopkOp : public framework::OperatorWithKernel { class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: - TopkOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input of Topk op"); AddOutput("Out", "(Tensor) The output tensor of Topk op"); diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index de5ff561add6183828f6bb4c44e30f6bb13079fa..0109b8bc5c30e0fe3e4ff9d741cd76b741e17926 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -55,8 +55,7 @@ class TransposeOp : public framework::OperatorWithKernel { class TransposeOpMaker : public framework::OpProtoAndCheckerMaker { public: - TransposeOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 2a49ee471f67cda87415db0e1440a4992c0cd088..3c705cb3396f68f88882388675ab145660e13070 
100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -66,15 +66,14 @@ class UniformRandomOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - static_cast(ctx.Attr("dtype")), + static_cast(ctx.Attr("dtype")), ctx.GetPlace()); } }; class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker { public: - UniformRandomOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddOutput("Out", "(Tensor) The output tensor of uniform random op"); AddComment(R"DOC( @@ -100,7 +99,7 @@ uniform distribution. "0 means use a seed generated by the system.") .SetDefault(0); AddAttr("dtype", "(int, default 5(FP32)) Output tensor data type") - .SetDefault(framework::DataType::FP32); + .SetDefault(framework::proto::DataType::FP32); } }; } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 49df2a530cd0c5c13f08db4b1e7db62679081e9b..7c035c0b48ebb0d7115e1499c03f8f40f2ca7282 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -18,8 +18,7 @@ namespace operators { class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Unpool2dOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput( "X", diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 9a092a570ff1f3f529413cd44dff55147adbaadc..324c8b98c4811328b2a89eadc3e3420c080bd7d1 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -46,7 +46,7 @@ class WhileOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); auto step_scopes = @@ -64,7 +64,7 @@ class WhileOp : public framework::OperatorBase { class WhileOpMaker : public framework::OpProtoAndCheckerMaker { public: - WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput(kParameters, "A set of variables, which are required by operators inside the " @@ -82,8 +82,8 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(StepScopeVar) A vector of local scope, which size equals the " "step number of While Op. 
The i'th scope storages temporary " "variables generated in the i'th step."); - AddAttr(kStepBlock, - "The step block inside WhileOp"); + AddAttr(kStepBlock, + "The step block inside WhileOp"); AddComment(R"DOC( )DOC"); } @@ -99,7 +99,7 @@ class WhileGradOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { framework::Executor executor(dev_ctx); - auto *block = Attr(kStepBlock); + auto *block = Attr(kStepBlock); auto *program = block->Program(); auto *step_scopes = @@ -209,8 +209,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - std::unique_ptr Apply() const override { - auto *grad = new framework::OpDescBind(); + std::unique_ptr Apply() const override { + auto *grad = new framework::OpDesc(); grad->SetType("while_grad"); grad->SetInput(kParameters, Input(kParameters)); @@ -279,14 +279,14 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. grad->SetAttr("original_output_grad", extra_inputs_list); - return std::unique_ptr(grad); + return std::unique_ptr(grad); } }; class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDescBind &op_desc, - framework::BlockDescBind *block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto p_names = op_desc.Input(kParameters); auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); @@ -321,10 +321,10 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { continue; } auto dims = ctx->GetInputsElementDim(kParameters, i); - if (var_types[i] == framework::VarDesc::LOD_TENSOR) { + if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { names_to_set.push_back(pg_names[i]); dims_to_set.push_back(dims); - } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { + } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) { // not sure how to set the dim of LOD_TENSOR_ARRAY names_to_set.push_back(pg_names[i]); dims_to_set.push_back(dims); diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 8cdc5f43403b0c54d3f1f01a3e97405fd5b2f434..dacee74fff369586c7ca2ff62cfe6aeebd8f39c7 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -19,7 +19,7 @@ CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } -CPUDeviceContext::CPUDeviceContext(CPUPlace place) { +CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -27,7 +27,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { return eigen_device_.get(); } -Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } +Place CPUDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 56813a1d5b3c2a7f4ff7b4eddc6fa47ed861700c..6cc0508522a97f3097b30e3340e7413a7093714a 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -45,6 +45,7 @@ class CPUDeviceContext : public DeviceContext { Place GetPlace() const override; private: + CPUPlace place_; std::unique_ptr eigen_device_; }; diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 
541eca5f39c2e6a4b464aec79fd8a920ab4c7732..7037551d7544d6fea54e2f4bf887309b7dc5a52e 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -97,17 +97,6 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count, "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync"); } -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind) { - PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind), - "cudaMemcpy failed in paddle::platform::GpuMemcpySync"); - // note: cudaMemcpy may actually be asynchronous with respect to the caller, - // block on stream 0 to make sure the copy has completed - PADDLE_ENFORCE( - cudaStreamSynchronize(0), - "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync"); -} - void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream) { PADDLE_ENFORCE( diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h index db961f3838af73855312d4cf6a80e2355306e08f..d05131fa4196057d19a8ae57bf4574c666e409cf 100644 --- a/paddle/platform/gpu_info.h +++ b/paddle/platform/gpu_info.h @@ -52,10 +52,6 @@ size_t GpuMaxChunkSize(); void GpuMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); -//! Copy memory from address src to dst synchronously. -void GpuMemcpySync(void *dst, const void *src, size_t count, - enum cudaMemcpyKind kind); - //! Copy memory from one device to another device. void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device, size_t count, cudaStream_t stream); diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index d36eac8379ebedb284b36012a46186cd3ac43b91..464096111e4a85b8d64d9223bfb85a1d1d42fad4 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -53,11 +53,11 @@ TEST(Transform, GPUUnary) { CUDADeviceContext ctx(gpu0); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); - Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); + Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); - Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); + Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(cpu_buf[i], static_cast(i + 1), 1e-5); @@ -83,11 +83,11 @@ TEST(Transform, GPUBinary) { GPUPlace gpu0(0); CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); - Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); + Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); - Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); + Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); Free(gpu0, gpu_buf); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc index 24f2a9383f7a069f1a8c7ed2bf3da46720470efa..f4f281229e611a6c9c8e9ecd54e0097ab683bbf3 100644 --- a/paddle/pybind/print_operators_doc.cc +++ b/paddle/pybind/print_operators_doc.cc @@ -31,31 +31,32 @@ std::string Escape(const std::string& s) { return r; } -std::string AttrType(paddle::framework::AttrType at) { +std::string AttrType(paddle::framework::proto::AttrType at) { switch (at) { - case 
paddle::framework::INT: + case paddle::framework::proto::INT: return "int"; - case paddle::framework::FLOAT: + case paddle::framework::proto::FLOAT: return "float"; - case paddle::framework::STRING: + case paddle::framework::proto::STRING: return "string"; - case paddle::framework::BOOLEAN: + case paddle::framework::proto::BOOLEAN: return "bool"; - case paddle::framework::INTS: + case paddle::framework::proto::INTS: return "int array"; - case paddle::framework::FLOATS: + case paddle::framework::proto::FLOATS: return "float array"; - case paddle::framework::STRINGS: + case paddle::framework::proto::STRINGS: return "string array"; - case paddle::framework::BOOLEANS: + case paddle::framework::proto::BOOLEANS: return "bool array"; - case paddle::framework::BLOCK: + case paddle::framework::proto::BLOCK: return "block id"; } return "UNKNOWN"; // not possible } -void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) { +void PrintVar(const paddle::framework::proto::OpProto::Var& v, + std::stringstream& ss) { ss << " { " << "\n" << " \"name\" : \"" << Escape(v.name()) << "\",\n" @@ -65,7 +66,7 @@ void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) { << " },"; } -void PrintAttr(const paddle::framework::OpProto::Attr& a, +void PrintAttr(const paddle::framework::proto::OpProto::Attr& a, std::stringstream& ss) { ss << " { " << "\n" @@ -81,7 +82,7 @@ void PrintOpProto(const std::string& type, std::stringstream& ss) { std::cerr << "Processing " << type << "\n"; - const paddle::framework::OpProto* p = opinfo.proto_; + const paddle::framework::proto::OpProto* p = opinfo.proto_; if (p == nullptr) { return; // It is possible that an operator doesn't have OpProto. } diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 6c8f06cccb92fa9cd22fdb89a9d410e6853895cc..f105370f226e2cceaac685f280d55134d4291028 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -108,21 +108,21 @@ static py::bytes SerializeMessage(T &self) { // Bind Methods void BindProgramDesc(py::module &m) { - py::class_(m, "ProgramDesc", "") + py::class_(m, "ProgramDesc", "") .def(py::init<>()) .def("__init__", - [](ProgramDescBind &self, const ProgramDescBind &other) { - new (&self) ProgramDescBind(other); + [](ProgramDesc &self, const ProgramDesc &other) { + new (&self) ProgramDesc(other); }) .def("__init__", - [](ProgramDescBind &self, const py::bytes &binary_str) { + [](ProgramDesc &self, const py::bytes &binary_str) { std::string str(binary_str); - new (&self) ProgramDescBind(str); + new (&self) ProgramDesc(str); }) - .def("append_block", &ProgramDescBind::AppendBlock, + .def("append_block", &ProgramDesc::AppendBlock, py::return_value_policy::reference) .def("append_backward", - [](ProgramDescBind &program_desc, const VarDescBind &target, + [](ProgramDesc &program_desc, const VarDesc &target, const std::unordered_set &no_grad_vars) { ParamGradInfoMap param_grad_map = AppendBackward(program_desc, target, no_grad_vars); @@ -138,13 +138,13 @@ void BindProgramDesc(py::module &m) { } return retv; }) - .def("block", &ProgramDescBind::MutableBlock, + .def("block", &ProgramDesc::MutableBlock, py::return_value_policy::reference) - .def("num_blocks", &ProgramDescBind::Size) - .def("serialize_to_string", SerializeMessage) + .def("num_blocks", &ProgramDesc::Size) + .def("serialize_to_string", SerializeMessage) .def("parse_from_string", - [](ProgramDescBind &program_desc, const std::string &data) { - ProgramDesc *desc = program_desc.Proto(); + [](ProgramDesc 
&program_desc, const std::string &data) { + proto::ProgramDesc *desc = program_desc.Proto(); PADDLE_ENFORCE(desc->ParseFromString(data), "Fail to parse ProgramDesc from string. This could " "be a bug of Paddle."); @@ -152,109 +152,115 @@ void BindProgramDesc(py::module &m) { } void BindBlockDesc(py::module &m) { - py::class_(m, "BlockDesc", "") - .def_property_readonly("id", &BlockDescBind::ID) - .def_property_readonly("parent", &BlockDescBind::Parent) - .def("append_op", &BlockDescBind::AppendOp, + py::class_(m, "BlockDesc", "") + .def_property_readonly("id", &BlockDesc::ID) + .def_property_readonly("parent", &BlockDesc::Parent) + .def("append_op", &BlockDesc::AppendOp, py::return_value_policy::reference) - .def("prepend_op", &BlockDescBind::PrependOp, + .def("prepend_op", &BlockDesc::PrependOp, py::return_value_policy::reference) + .def("remove_op", &BlockDesc::RemoveOp) .def("var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.Var(name); }, py::return_value_policy::reference) .def("has_var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.HasVar(name); }) .def("find_var", - [](BlockDescBind &self, py::bytes byte_name) { + [](BlockDesc &self, py::bytes byte_name) { std::string name = byte_name; return self.FindVar(name); }, py::return_value_policy::reference) - .def("all_vars", &BlockDescBind::AllVars, - py::return_value_policy::reference) - .def("op_size", &BlockDescBind::OpSize) - .def("op", &BlockDescBind::Op, py::return_value_policy::reference) - .def("serialize_to_string", SerializeMessage); + .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference) + .def("op_size", &BlockDesc::OpSize) + .def("op", &BlockDesc::Op, py::return_value_policy::reference) + .def("serialize_to_string", SerializeMessage); } void BindVarDsec(py::module &m) { - py::enum_(m, "DataType", "") - .value("BOOL", DataType::BOOL) - .value("INT16", DataType::INT16) - .value("INT32", DataType::INT32) - .value("INT64", DataType::INT64) - .value("FP16", DataType::FP16) - .value("FP32", DataType::FP32) - .value("FP64", DataType::FP64); + py::enum_(m, "DataType", "") + .value("BOOL", proto::DataType::BOOL) + .value("INT16", proto::DataType::INT16) + .value("INT32", proto::DataType::INT32) + .value("INT64", proto::DataType::INT64) + .value("FP16", proto::DataType::FP16) + .value("FP32", proto::DataType::FP32) + .value("FP64", proto::DataType::FP64); - py::class_ var_desc(m, "VarDesc", ""); + py::class_ var_desc(m, "VarDesc", ""); var_desc .def("name", - [](const VarDescBind &self) { + [](const VarDesc &self) { py::bytes name = self.Name(); return name; }, py::return_value_policy::reference) - .def("set_shape", &VarDescBind::SetShape) - .def("set_dtype", &VarDescBind::SetDataType) - .def("shape", &VarDescBind::Shape, py::return_value_policy::reference) - .def("dtype", &VarDescBind::GetDataType) - .def("lod_level", &VarDescBind::GetLodLevel) - .def("set_lod_level", &VarDescBind::SetLoDLevel) - .def("type", &VarDescBind::GetType) - .def("set_type", &VarDescBind::SetType) - .def("serialize_to_string", SerializeMessage) - .def("persistable", &VarDescBind::Persistable) - .def("set_persistable", &VarDescBind::SetPersistable); + .def("set_shape", &VarDesc::SetShape) + .def("set_dtype", &VarDesc::SetDataType) + .def("shape", &VarDesc::Shape, py::return_value_policy::reference) + .def("dtype", &VarDesc::GetDataType) + .def("lod_level", 
&VarDesc::GetLodLevel) + .def("set_lod_level", &VarDesc::SetLoDLevel) + .def("type", &VarDesc::GetType) + .def("set_type", &VarDesc::SetType) + .def("serialize_to_string", SerializeMessage) + .def("persistable", &VarDesc::Persistable) + .def("set_persistable", &VarDesc::SetPersistable); - py::enum_(var_desc, "VarType", "") - .value("LOD_TENSOR", VarDesc::LOD_TENSOR) - .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS) - .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH) - .value("FETCH_LIST", VarDesc::FETCH_LIST) - .value("STEP_SCOPES", VarDesc::STEP_SCOPES) - .value("LOD_RANK_TABLE", VarDesc::LOD_RANK_TABLE) - .value("LOD_TENSOR_ARRAY", VarDesc::LOD_TENSOR_ARRAY); + py::enum_(var_desc, "VarType", "") + .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR) + .value("SELECTED_ROWS", proto::VarDesc::SELECTED_ROWS) + .value("FEED_MINIBATCH", proto::VarDesc::FEED_MINIBATCH) + .value("FETCH_LIST", proto::VarDesc::FETCH_LIST) + .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES) + .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE) + .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY); } void BindOpDesc(py::module &m) { - py::enum_(m, "AttrType", "") - .value("INT", AttrType::INT) - .value("INTS", AttrType::INTS) - .value("FLOAT", AttrType::FLOAT) - .value("FLOATS", AttrType::FLOATS) - .value("STRING", AttrType::STRING) - .value("STRINGS", AttrType::STRINGS) - .value("BOOL", AttrType::BOOLEAN) - .value("BOOLS", AttrType::BOOLEANS) - .value("BLOCK", AttrType::BLOCK); + py::enum_(m, "AttrType", "") + .value("INT", proto::AttrType::INT) + .value("INTS", proto::AttrType::INTS) + .value("FLOAT", proto::AttrType::FLOAT) + .value("FLOATS", proto::AttrType::FLOATS) + .value("STRING", proto::AttrType::STRING) + .value("STRINGS", proto::AttrType::STRINGS) + .value("BOOL", proto::AttrType::BOOLEAN) + .value("BOOLS", proto::AttrType::BOOLEANS) + .value("BLOCK", proto::AttrType::BLOCK); - py::class_ op_desc(m, "OpDesc", ""); - op_desc.def("type", &OpDescBind::Type) - .def("set_type", &OpDescBind::SetType) - .def("input", &OpDescBind::Input) - .def("input_names", &OpDescBind::InputNames) - .def("set_input", &OpDescBind::SetInput) - .def("output", &OpDescBind::Output) - .def("output_names", &OpDescBind::OutputNames) - .def("set_output", &OpDescBind::SetOutput) - .def("has_attr", &OpDescBind::HasAttr) - .def("attr_type", &OpDescBind::GetAttrType) - .def("attr_names", &OpDescBind::AttrNames) - .def("set_attr", &OpDescBind::SetAttr) - .def("attr", &OpDescBind::GetAttr) - .def("set_block_attr", &OpDescBind::SetBlockAttr) - .def("block_attr", &OpDescBind::GetBlockAttr) - .def("check_attrs", &OpDescBind::CheckAttrs) - .def("infer_shape", &OpDescBind::InferShape) - .def("infer_var_type", &OpDescBind::InferVarType) - .def("serialize_to_string", SerializeMessage); + py::class_ op_desc(m, "OpDesc", ""); + op_desc.def("type", &OpDesc::Type) + .def("set_type", &OpDesc::SetType) + .def("input", &OpDesc::Input) + .def("input_names", &OpDesc::InputNames) + .def("set_input", &OpDesc::SetInput) + .def("output", &OpDesc::Output) + .def("output_names", &OpDesc::OutputNames) + .def("set_output", &OpDesc::SetOutput) + .def("has_attr", &OpDesc::HasAttr) + .def("attr_type", &OpDesc::GetAttrType) + .def("attr_names", &OpDesc::AttrNames) + .def("set_attr", &OpDesc::SetAttr) + .def("attr", &OpDesc::GetAttr) + .def("set_block_attr", &OpDesc::SetBlockAttr) + .def("set_serialized_attr", + [](OpDesc &self, const std::string &name, + const py::bytes &seriralized) { + std::string ser(seriralized); + self.SetAttr(name, ser); + }) + 
.def("block_attr", &OpDesc::GetBlockAttr) + .def("check_attrs", &OpDesc::CheckAttrs) + .def("infer_shape", &OpDesc::InferShape) + .def("infer_var_type", &OpDesc::InferVarType) + .def("serialize_to_string", SerializeMessage); } } // namespace pybind diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 4a82f1596eb0b7b3cfe9b9bbce32549f58efdbc9..2d7fe251416dce629dd0a2318aaa020ec9668d9b 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -266,36 +266,36 @@ All parameter, weight, gradient are variables in Paddle. return ret_values; }); m.def("get_grad_op_descs", - [](const OpDescBind &op_desc, + [](const OpDesc &op_desc, const std::unordered_set &no_grad_set, std::unordered_map &grad_to_var, - const std::vector &grad_sub_block) { - std::vector> grad_op_descs = + const std::vector &grad_sub_block) { + std::vector> grad_op_descs = framework::OpInfoMap::Instance() .Get(op_desc.Type()) .GradOpMaker()(op_desc, no_grad_set, &grad_to_var, grad_sub_block); - std::vector grad_op_desc_ptrs(grad_op_descs.size()); + std::vector grad_op_desc_ptrs(grad_op_descs.size()); std::transform( grad_op_descs.begin(), grad_op_descs.end(), grad_op_desc_ptrs.begin(), - [](std::unique_ptr &p) { return p.release(); }); + [](std::unique_ptr &p) { return p.release(); }); return grad_op_desc_ptrs; }); - m.def("prune", [](const ProgramDescBind &origin, + m.def("prune", [](const ProgramDesc &origin, const std::vector> &targets) { - ProgramDescBind prog_with_targets(origin); + ProgramDesc prog_with_targets(origin); for (const auto &t : targets) { prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget(); } - ProgramDesc pruned_desc; + proto::ProgramDesc pruned_desc; Prune(*prog_with_targets.Proto(), &pruned_desc); - return new ProgramDescBind(pruned_desc); + return new ProgramDesc(pruned_desc); }); - m.def("inference_optimize", [](ProgramDescBind &origin) { - ProgramDesc pruned_desc; + m.def("inference_optimize", [](ProgramDesc &origin) { + proto::ProgramDesc pruned_desc; InferenceOptimize(*(origin.Proto()), &pruned_desc); - return new ProgramDescBind(pruned_desc); + return new ProgramDesc(pruned_desc); }); m.def_submodule( "var_names", @@ -345,7 +345,7 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Operator") .def_static("create", [](py::bytes protobin) { - OpDesc desc; + proto::OpDesc desc; PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), "Cannot parse user input to OpDesc"); PADDLE_ENFORCE(desc.IsInitialized(), @@ -398,7 +398,7 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "CondOp") .def_static("create", [](py::bytes protobin) -> operators::CondOp * { - OpDesc desc; + proto::OpDesc desc; PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), "Cannot parse user input to OpDesc"); PADDLE_ENFORCE(desc.IsInitialized(), diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 41fa658502d341fe9653a3e99b58498fcaeada47..268a0f2fa386adf99f7ea1589ff1f301f943a68b 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/framework/executor.h" #include "paddle/framework/tensor.h" #include "paddle/memory/memcpy.h" #include "pybind11/numpy.h" @@ -61,11 +62,15 @@ struct CastToPyBufferImpl { auto *src_ptr = static_cast(tensor.data()); auto *dst_ptr = static_cast(dst_tensor.mutable_data( tensor.dims(), platform::CPUPlace())); - // TODO(qijun): Here we use default CUDA stream to set GPU Tensor to - // a Python numpy array. 
It's better to manage CDUA stream unifiedly. - paddle::platform::GpuMemcpySync(dst_ptr, src_ptr, - sizeof(CUR_TYPE) * tensor.numel(), - cudaMemcpyDeviceToHost); + + framework::DeviceContextPool &pool = + framework::DeviceContextPool::Get(); + auto dev_ctx = static_cast( + pool.Borrow(tensor.place())); + + paddle::platform::GpuMemcpyAsync( + dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), + cudaMemcpyDeviceToHost, dev_ctx->stream()); #else PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); #endif @@ -132,10 +137,12 @@ void PyCUDATensorSetFromArray( self.Resize(framework::make_ddim(dims)); auto *dst = self.mutable_data(place); - // TODO(qijun): Here we use default CUDA stream to set a Python numpy - // array to a GPU Tensor. It's better to manage CDUA stream unifiedly. - paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(), - cudaMemcpyHostToDevice); + + framework::DeviceContextPool &pool = framework::DeviceContextPool::Get(); + auto dev_ctx = + static_cast(pool.Borrow(place)); + paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(), + cudaMemcpyHostToDevice, dev_ctx->stream()); } #endif diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index ff0bac6a0740111dfa1a1440daaf1ceaf3a7b0d8..0db8d33bbcb5278ed0dd5584b5822502b719ede9 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -14,9 +14,8 @@ make -j `nproc` print_operators_doc paddle/pybind/print_operators_doc > doc/en/html/operators.json # check websites for broken links -# It will be failed now! -#linkchecker doc/en/html/index.html -#linkchecker doc/cn/html/index.html +linkchecker doc/en/html/index.html +linkchecker doc/cn/html/index.html # Parse Github URL REPO=`git config remote.origin.url` diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 8bfe56d795e394efffabb61f145b1a20d806447d..b5cde7bac779ee1d54395b68941df2693e1ed0f5 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -25,10 +25,10 @@ from paddle.trainer.config_parser import * __all__ = [ 'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg', - 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', - 'simple_attention', 'dot_product_attention', 'multi_head_attention', - 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', - 'inputs', 'outputs' + 'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit', + 'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention', + 'multi_head_attention', 'simple_gru2', 'bidirectional_gru', + 'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs' ] ###################################################### @@ -251,13 +251,13 @@ def img_conv_bn_pool(input, pool_layer_attr=None): """ Convolution, batch normalization, pooling group. - + Img input => Conv => BN => Pooling => Output. :param name: group name. :type name: basestring :param input: input layer. - :type input: LayerOutput + :type input: LayerOutput :param filter_size: see img_conv_layer for details. :type filter_size: int :param num_filters: see img_conv_layer for details. 
@@ -435,6 +435,85 @@ def img_conv_group(input, input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type) +@wrap_name_default("separable_conv") +def img_separable_conv(input, + num_channels, + num_out_channels, + filter_size, + stride=1, + padding=0, + depth_multiplier=1, + act=None, + bias_attr=None, + param_attr=None, + shared_bias=True, + layer_type='exconv', + name=None): + """ + Separable Convolution. + + The separable convolution module is consisted of a depthwise convolution + that acts separately on input channels, followed by a pointwise convolution + with 1*1 kernels that mixes channels. It is used for Xception: + https://arxiv.org/pdf/1610.02357.pdf + + :param input: input layer. + :type input: LayerOutput + :param num_channels: the number of input channels. + :type num_channels: int + :param num_out_channels: the number of output channels. + :type num_out_channels: int + :param filter_size: the filter size for the depthwise convolution. + :type filter_size: int|tuple + :param stride: the stride size for the depthwise convolution. + :type stride: int|tuple + :param padding: the padding size for the depthwise convolution. + :type padding: int|tuple + :param depth_multiplier: the number of filter for one channel in the + depthwize convolution. + :type depth_multiplier: int + :param act: the activation function for the output. + :type act: BaseActivation + :param bias_attr: see img_conv_layer for details. + :type bias_attr: ParameterAttribute + :param param_attr: see img_conv_layer for details. + :type param_attr: ParameterAttribute + :param shared_bias: see img_conv_layer for details. + :type shared_bias: bool + :param layer_type: see img_conv_layer for details. + :type layer_type: bool + :return: layer's output + :rtype: LayerOutput + """ + __depthwise_conv__ = img_conv_layer( + name="%s_depthwise_conv" % name, + input=input, + num_channels=num_channels, + num_filters=num_channels * depth_multiplier, + groups=num_channels, + filter_size=filter_size, + stride=stride, + padding=padding, + act=LinearActivation(), + bias_attr=bias_attr, + param_attr=param_attr, + shared_biases=shared_bias, + layer_type=layer_type) + __pointwise_conv__ = img_conv_layer( + name="%s_pointwise_conv" % name, + input=__depthwise_conv__, + num_channels=num_channels * depth_multiplier, + num_filters=num_out_channels, + filter_size=1, + stride=1, + padding=0, + act=act, + bias_attr=bias_attr, + param_attr=param_attr, + shared_biases=shared_bias) + return __pointwise_conv__ + + def small_vgg(input_image, num_channels, num_classes): def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None): return img_conv_group( @@ -648,7 +727,7 @@ def lstmemory_unit(input, lstm_bias_attr=None, lstm_layer_attr=None): """ - lstmemory_unit defines the caculation process of a LSTM unit during a + lstmemory_unit defines the caculation process of a LSTM unit during a single time step. This function is not a recurrent layer, so it can not be directly used to process sequence input. This function is always used in recurrent_group (see layers.py for more details) to implement attention @@ -869,7 +948,7 @@ def gru_unit(input, gru_layer_attr=None, naive=False): """ - gru_unit defines the calculation process of a gated recurrent unit during a single + gru_unit defines the calculation process of a gated recurrent unit during a single time step. This function is not a recurrent layer, so it can not be directly used to process sequence input. 
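For reference, a minimal usage sketch of the img_separable_conv group added above. It assumes the usual trainer_config_helpers entry points (data_layer, ReluActivation), which are not part of this hunk; sizes and layer names are illustrative only.

.. code-block:: python

    # Hypothetical config snippet: one separable convolution on a 3-channel
    # 224x224 image (a depthwise conv per input channel, then a 1x1 pointwise
    # conv that mixes channels), as described in the docstring above.
    img = data_layer(name='image', size=3 * 224 * 224)
    sep = img_separable_conv(
        input=img,
        num_channels=3,
        num_out_channels=64,
        filter_size=3,
        stride=2,
        padding=1,
        depth_multiplier=1,
        act=ReluActivation())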
This function is always used in the recurrent_group (see layers.py for more details) to implement attention @@ -1012,7 +1091,7 @@ def simple_gru(input, simple_gru in network.py. The reason why there are so many interfaces is that we have two ways to implement recurrent neural network. One way is to use one complete layer to implement rnn (including simple rnn, gru and lstm) - with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But + with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But the multiplication operation :math:`W x_t` is not computed in these layers. See details in their interfaces in layers.py. The other implementation is to use an recurrent group which can ensemble a @@ -1116,7 +1195,7 @@ def simple_gru2(input, :type act: BaseActivation :param gate_act: gate activiation type of gru :type gate_act: BaseActivation - :param gru_bias_attr: bias parameter attribute of gru layer, + :param gru_bias_attr: bias parameter attribute of gru layer, False means no bias, None means default bias. :type gru_bias_attr: ParameterAttribute|False|None :param gru_param_attr: param parameter attribute of gru layer, @@ -1189,7 +1268,7 @@ def bidirectional_gru(input, :type size: int :param return_seq: If set False, the last time step of output are concatenated and returned. - If set True, the entire output sequences in forward + If set True, the entire output sequences in forward and backward directions are concatenated and returned. :type return_seq: bool :return: LayerOutput object. @@ -1278,7 +1357,7 @@ def bidirectional_lstm(input, :type size: int :param return_seq: If set False, the last time step of output are concatenated and returned. - If set True, the entire output sequences in forward + If set True, the entire output sequences in forward and backward directions are concatenated and returned. :type return_seq: bool :return: LayerOutput object. 
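As a quick illustration of the return_seq flag documented above for the bidirectional groups, a hedged sketch; data_layer and embedding_layer are assumed helpers from the same module family and the sizes are made up.

.. code-block:: python

    # Illustrative only: the same bidirectional LSTM group used two ways.
    words = data_layer(name='word_ids', size=10000)
    emb = embedding_layer(input=words, size=128)
    # return_seq=False: the last time steps of the forward and backward
    # outputs are concatenated, giving one vector per sequence.
    sent_vec = bidirectional_lstm(input=emb, size=256, return_seq=False)
    # return_seq=True: the entire forward and backward output sequences are
    # concatenated and returned step by step.
    sent_seq = bidirectional_lstm(input=emb, size=256, return_seq=True)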
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 9b3792ee9e3e4c6f319b3e2b13c4aa3a05cce8be..471255ef50a3be4739a89efbd978cdb4304d992d 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -16,13 +16,14 @@ import regularizer from param_attr import ParamAttr from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, GPUPlace +from distribute_transpiler import DistributeTranspiler import clip Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' - 'DataFeeder', 'clip' + 'DataFeeder', 'clip', 'DistributeTranspiler' ] diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..50364c64bec46aabc7be4f4b4370a3ad5b0eb07c --- /dev/null +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -0,0 +1,238 @@ +import framework +from framework import Program, default_main_program, Parameter, Variable +import optimizer +from layer_helper import LayerHelper + + +def hash_name_to_server(params_grads, pserver_endpoints): + """ + :param param_grads: + :return: a map of pserver endpoint -> + params -> [param list] + grads -> [grad list] + """ + + def _hash_param(param_name, total): + return hash(param_name) % total + + param_grad_map = dict() + for param, grad in params_grads: + if param.trainable is True and grad is not None: + server_id = _hash_param(param.name, len(pserver_endpoints)) + server_for_param = pserver_endpoints[server_id] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + return param_grad_map + + +def round_robin(params_grads, pserver_endpoints): + assert (len(params_grads) > len(pserver_endpoints)) + + param_grad_map = dict() + pserver_idx = 0 + for param, grad in params_grads: + if param.trainable is True: + server_for_param = pserver_endpoints[pserver_idx] + if not param_grad_map.has_key(server_for_param): + param_grad_map[server_for_param] = {"params": [], "grads": []} + + param_grad_map[server_for_param]["params"].append(param) + param_grad_map[server_for_param]["grads"].append(grad) + + pserver_idx += 1 + if pserver_idx >= len(pserver_endpoints): + pserver_idx = 0 + return param_grad_map + + +class DistributeTranspiler: + def transpile(self, + optimize_ops, + params_grads, + program=None, + pservers="127.0.0.1:6174", + trainers=1, + split_method=round_robin): + """ + Transpile the program to a distributed data-parallelism programs. + + The main_program will be transform to use a remote parameter server + to do parameter optimization. And the optimization graph will be put + in to a parameter server program. + + Use different methods to split trainable varialbles to different + parameter servers. 
+ + Example to run: + + exe = fluid.Executor(place) + t = fluid.DistributeTranspiler() + t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1) + + pserver_endpoint = os.getenv("PSERVER") + if pserver_endpoint: + pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) + else: + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + ... + + :param optimize_ops: op list of optimization, should be the + return value of Optimizer.minimize + :type optimize_ops: list + :param program: program to optimize, default default_main_program + :param pservers: parameter server endpoints like "m1:6174,m2:6174" + :type pservers: string + + :return: return a list of programs + """ + if program is None: + program = default_main_program() + self.trainers = trainers + self._optimize_distributed( + optimize_ops, + program, + params_grads, + pservers=pservers, + trainers=trainers, + split_method=split_method) + + def _clone_param(self, block, v): + assert isinstance(v, Parameter) + new_p = Parameter( + block=block, + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=v.stop_gradient, + trainable=v.trainable, + optimize_attr=v.optimize_attr, + regularizer=v.regularizer, + name=v.name) + block.vars[new_p.name] = new_p + + def _clone_var(self, block, var): + assert isinstance(var, Variable) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=var.persistable) + + def _optimize_distributed(self, optimize_ops, program, params_and_grads, + **kwargs): + if kwargs.has_key("split_method"): + split_method = kwargs["split_method"] + else: + split_method = round_robin + + assert (callable(split_method)) + pserver_endpoints = kwargs["pservers"].split(",") + self.param_grad_map = split_method(params_and_grads, pserver_endpoints) + + send_op_ordered_inputs = [] + epmap = [] + for ep, v in self.param_grad_map.iteritems(): + send_op_ordered_inputs.extend(v["grads"]) + for i in v["grads"]: + epmap.append(ep) + send_op = program.global_block().append_op( + type="send", + inputs={"X": send_op_ordered_inputs + }, # inputs is a list of tensors to be send + outputs={}, + attrs={"endpoints": pserver_endpoints, + "epmap": epmap}) + + def get_trainer_program(optimize_ops, program): + # remove optimize ops and add a send op to main_program + program.global_block().delete_ops(optimize_ops) + + def _create_var_for_trainers(self, block, var, trainers): + var_list = [] + for i in xrange(trainers): + var_each = block.create_var( + name="%s.trainer_%d" % (var.name, i), + psersistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + var_list.append(var_each) + return var_list + + def get_pserver_program(self, endpoint, optimize_ops): + pserver_program = Program() + for v in self.param_grad_map[endpoint]["params"]: + self._clone_param(pserver_program.global_block(), v) + + optimize_sub_program = Program() + grad_var_names = [ + var.name for var in self.param_grad_map[endpoint]["grads"] + ] + for opt_op in optimize_ops: + for _, var in opt_op.inputs.iteritems(): + # NOTE: append operators to merge gradients from multiple + # trainers. If trainers == 1, this is not needed. 
+ if self.trainers > 1 and var.name in grad_var_names: + vars2merge = self._create_var_for_trainers( + optimize_sub_program.global_block(), var, self.trainers) + merged_var = optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + optimize_sub_program.global_block().append_op( + type="sum", + inputs={"X": vars2merge}, + outputs={"Out": merged_var}) + optimize_sub_program.global_block().append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(self.trainers)}) + else: + optimize_sub_program.global_block().create_var( + name=var.name, + persistable=var.persistable, + dtype=var.dtype, + shape=var.shape) + + if opt_op.inputs.has_key("Grad"): + if opt_op.inputs["Grad"].name in grad_var_names: + print "appending ", opt_op.type, opt_op.inputs + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + else: + optimize_sub_program.global_block().append_op( + type=opt_op.type, + inputs=opt_op.inputs, + outputs=opt_op.outputs, + attrs=opt_op.attrs) + pserver_program.global_block().append_op( + type="recv", + inputs={"RX": + self.param_grad_map[endpoint]["grads"]}, # grads to recv + outputs={}, + attrs={ + "OptimizeProgram": optimize_sub_program.desc, + "endpoint": endpoint, + "ParamList": + [p.name for p in self.param_grad_map[endpoint]["params"]], + "GradList": + [p.name for p in self.param_grad_map[endpoint]["grads"]], + "Trainers": self.trainers + }) + pserver_program.sync_with_cpp() + return pserver_program diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 9a99b045dc70a9e4662a6f4da141183ffc8f1846..4b4a0820abb9a85f3e9936190c835c2f186107b3 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,6 +1,6 @@ import numpy as np from . 
import core -from framework import Program, default_main_program +from framework import Program, default_main_program, Parameter, Variable __all__ = ['Executor', 'g_scope'] @@ -148,7 +148,7 @@ class Executor(object): outputs={'Out': [fetch_var]}, attrs={'col': i}) - self.executor.run(program.desc, scope, 0, True) + self.executor.run(program.desc, scope, 0, True, True) outs = [ core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index d1b12a8f097ed674b6b6384fe8ba5db950a94da5..7b65fe80aed6a450c7aea1ef5e0fcf03a2a26686 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -359,6 +359,10 @@ class Operator(object): """ self.block = block self.desc = desc + # for clone a new operator + self.inputs = inputs + self.outputs = outputs + self.attrs = attrs if len(self.desc.type()) != 0: return if type is None: @@ -389,7 +393,10 @@ class Operator(object): % (in_proto.name, len(in_args))) in_arg_names = [] for arg in in_args: - in_arg_names.append(arg.name) + if isinstance(arg, basestring): + in_arg_names.append(arg) + else: + in_arg_names.append(arg.name) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -430,13 +437,18 @@ class Operator(object): continue if isinstance(attrs[attr_name], Block): self.desc.set_block_attr(attr_name, attrs[attr_name].desc) + elif isinstance(attrs[attr_name], core.BlockDesc) or \ + isinstance(attrs[attr_name], core.ProgramDesc): + self.desc.set_serialized_attr( + attr_name, attrs[attr_name].serialize_to_string()) else: self.desc.set_attr(attr_name, attrs[attr_name]) self.desc.check_attrs() no_kernel_op_set = { 'feed', 'fetch', 'save', 'load', 'recurrent', - 'rnn_memory_helper_grad', 'conditional_block', 'while' + 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', + 'recv' } if type not in no_kernel_op_set: self.desc.infer_var_type(self.block.desc) @@ -582,6 +594,7 @@ class Block(object): self.vars = dict() # var_name --> var self.ops = collections.deque() # operator list self.program = program + self.removed_vars = dict() def __str__(self): return self.to_string(True) @@ -638,6 +651,16 @@ class Block(object): self.ops.append(op) return op + def delete_ops(self, ops): + # remove from cpp + # FIXME(typhoonzero): remove only the first occuracy. 
+ try: + start = list(self.ops).index(ops[0]) + end = list(self.ops).index(ops[-1]) + except Exception, e: + raise e + self.desc.remove_op(start, end) + def prepend_op(self, *args, **kwargs): op_desc = self.desc.prepend_op() op = Operator(self, op_desc, *args, **kwargs) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 8df30ad76b0b5ff2140e28935c386bbb603d8bea..a076f26f7ff279812f762bd62f72195dc2376378 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -194,3 +194,9 @@ class LayerHelper(object): else: # For integer and boolean types, initialize with all zeros return Constant() + + def is_instance(self, param_name, cls): + param = self.kwargs.get(param_name, None) + if not isinstance(param, cls): + raise TypeError("The input {0} parameter of method {1} must be {2}", + param_name, self.layer_type, cls.__name__) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index dc6c0e7f518ee47b332a501df803a2364e0cffc0..22a37c22c3fc777cadcdee6632bbf1fb558fef70 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -3,6 +3,7 @@ from ..framework import Program, Variable, Operator from .. import core from tensor import assign, fill_constant import contextlib +from ..registry import autodoc __all__ = [ 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard', @@ -10,7 +11,7 @@ __all__ = [ 'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor', 'increment', 'array_write', 'create_array', 'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock', - 'StaticRNN' + 'StaticRNN', 'reorder_lod_tensor_by_rank' ] @@ -440,9 +441,25 @@ def topk(input, k): def lod_tensor_to_array(x, table): - """ - This function creates an operator to convert an LOD_Tensor to - an array. + """This function performs the operation that converts an LOD_Tensor to + an array. + + Args: + x (Variable|list): The tensor that needs to be converted to an array. + table (ParamAttr|list): The variable that stores the level of lod + which is ordered by sequence length in + descending order. + + Returns: + Variable: The variable of type array that has been converted from a + tensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10]) + table = fluid.layers.lod_rank_table(x, level=0) + array = fluid.layers.lod_tensor_to_array(x, table) """ helper = LayerHelper("lod_tensor_to_array", **locals()) array = helper.create_variable( @@ -458,9 +475,26 @@ def lod_tensor_to_array(x, table): def array_to_lod_tensor(x, table): - """ - This function creates an operator to convert an array to a - LOD_Tensor. + """This function performs the operations that converts an array to + an LOD_Tensor. + + Args: + x (Variable|list): The array that needs to be converted to a tensor. + table (ParamAttr|list): The variable that stores the level of lod + which is ordered by sequence length in + descending order. + + Returns: + Variable: The variable of type tensor that has been converted + from an array. + + Examples: + .. 
code-block:: python + + x = fluid.layers.data(name='x', shape=[10]) + table = fluid.layers.lod_rank_table(x, level=0) + array = fluid.layers.lod_tensor_to_array(x, table) + lod_tensor = fluid.layers.array_to_lod_tensor(array, table) """ helper = LayerHelper("array_to_lod_tensor", **locals()) tmp = helper.create_tmp_variable(dtype=x.dtype) @@ -473,10 +507,24 @@ def array_to_lod_tensor(x, table): def increment(x, value=1.0, in_place=True): - """ - This function creates an operator to increment each value in the input - `x` by an amount: `value` as mentioned in the input parameter. This - operation is performed in-place by default. + """This function performs an operation that increments each value in the + input :math:`x` by an amount: :math:`value` as mentioned in the input + parameter. This operation is performed in-place by default. + + Args: + x (Variable|list): The tensor that has the input values. + value (float): The amount by which the values should be incremented. + in_place (bool): If the increment should be performed in-place. + + Returns: + Variable: The tensor variable storing the transformation of + element-wise increment of each value in the input. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.increment(x=data, value=3.0, in_place=True) """ helper = LayerHelper("increment", **locals()) if not in_place: @@ -492,9 +540,24 @@ def increment(x, value=1.0, in_place=True): def array_write(x, i, array=None): - """ - This function creates an operator to write the data out as a + """This function performs the operation to write the data out as an LOD_TENSOR_ARRAY. + + Args: + x (Variable|list): The input tensor from which the data will be read. + i (Variable|list): The subscript index in tensor array, that points the + place from which data will be read. + array (Variable|list): The data can be read into this variable if + this is assigned. + Returns: + Variable: The tensor type variable that has the data written to it. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = layers.array_write(tmp, i=i) """ helper = LayerHelper('array_write', **locals()) if array is None: @@ -511,6 +574,21 @@ def array_write(x, i, array=None): def create_array(dtype): + """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the + LayerHelper. + + Args: + dtype (int|float): The data type of the elements in the array. + + Returns: + Variable: The tensor variable storing the elements of data type. + + Examples: + .. code-block:: python + + data = fluid.layers.create_array(dtype='float32') + + """ helper = LayerHelper("array", **locals()) return helper.create_variable( name="{0}.out".format(helper.name), @@ -519,6 +597,24 @@ def create_array(dtype): def less_than(x, y, cond=None, **ignored): + """ + **Less than** + + This layer returns the truth value of :math:`x < y` elementwise. + + Args: + x(Variable): First operand of *less_than* + y(Variable): Second operand of *less_than* + cond(Variable|None): Optional output variable to store the result of *less_than* + + Returns: + Variable: The tensor variable storing the output of *less_than*. + + Examples: + .. 
code-block:: python + + less = fluid.layers.less_than(x=label, y=limit) + """ helper = LayerHelper("less_than", **locals()) if cond is None: cond = helper.create_tmp_variable(dtype='bool') @@ -531,9 +627,19 @@ def less_than(x, y, cond=None, **ignored): def array_read(array, i): - """ - This function creates an operator to read the data in as a + """This function performs the operation to read the data in as an LOD_TENSOR_ARRAY. + Args: + array (Variable|list): The input tensor that will be written to an array. + i (Variable|list): The subscript index in tensor array, that points the + place where data will be written to. + Returns: + Variable: The tensor type variable that has the data written to it. + Examples: + .. code-block::python + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = layers.array_read(tmp, i=i) """ helper = LayerHelper('array_read', **locals()) if not isinstance( @@ -567,9 +673,23 @@ def shrink_memory(x, i, table): def array_length(array): - """ - This function creates an operator to find the length of the + """This function performs the operation to find the length of the input LOD_TENSOR_ARRAY. + + Args: + array (LOD_TENSOR_ARRAY): The input array that will be used + to compute the length. + + Returns: + Variable: The length of the input LoDTensorArray. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + arr = fluid.layers.array_write(tmp, i=i) + arr_len = fluid.layers.array_length(arr) """ helper = LayerHelper('array_length', **locals()) tmp = helper.create_tmp_variable(dtype='int64') @@ -963,3 +1083,18 @@ class DynamicRNN(object): if self.status != DynamicRNN.IN_RNN: raise ValueError("{0} can only be invoked inside rnn block.".format( method)) + + +@autodoc +def reorder_lod_tensor_by_rank(x, rank_table): + helper = LayerHelper('reorder_lod_tensor_by_rank', **locals()) + helper.is_instance('x', Variable) + helper.is_instance('rank_table', Variable) + + out = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type='reorder_lod_tensor_by_rank', + inputs={'X': [x], + 'RankTable': [rank_table]}, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py index f4c5907f48b46ee5d9bcaba48370e5baf036c615..56c3f7b7b7f174338bb56bc5785423ca634650a6 100644 --- a/python/paddle/v2/fluid/layers/io.py +++ b/python/paddle/v2/fluid/layers/io.py @@ -12,20 +12,9 @@ def data(name, type=core.VarDesc.VarType.LOD_TENSOR, stop_gradient=True): """ - Data Layer. + **Data Layer** - Args: - name: The name/alias of the function - shape: Tuple declaring the shape. - append_batch_size: Whether or not to append the data as a batch. - dtype: The type of data : float32, float_16, int etc - type: The output type. By default it is LOD_TENSOR. - lod_level(int): The LoD Level. 0 means the input data is not a sequence. - main_program: Name of the main program that calls this - startup_program: Name of the startup program - stop_gradient: A boolean that mentions whether gradient should flow. - - This function takes in input and based on whether data has + This function takes in the input and based on whether data has to be returned back as a minibatch, it creates the global variable using the helper functions. The global variables can be accessed by all the following operations and layers in the graph. 
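The reorder_lod_tensor_by_rank layer added just above is registered through @autodoc and therefore ships without a usage example; a hedged sketch of the expected call, reusing the lod_rank_table helper shown in the other examples of this file (variable names are illustrative):

.. code-block:: python

    # Illustrative only: reorder the sequences of x according to the rank
    # table built from y (y's sequences sorted by descending length).
    x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1)
    y = fluid.layers.data(name='y', shape=[10], dtype='float32', lod_level=1)
    table = fluid.layers.lod_rank_table(y, level=0)
    x_reordered = fluid.layers.reorder_lod_tensor_by_rank(
        x=x, rank_table=table)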
@@ -33,6 +22,24 @@ def data(name, All the input variables of this function are passed in as local variables to the LayerHelper constructor. + Args: + name(str): The name/alias of the function + shape(list): Tuple declaring the shape. + append_batch_size(bool): Whether or not to append the data as a batch. + dtype(int|float): The type of data : float32, float_16, int etc + type(VarType): The output type. By default it is LOD_TENSOR. + lod_level(int): The LoD Level. 0 means the input data is not a sequence. + main_program(Program): Name of the main program that calls this + startup_program(Program): Name of the startup program + stop_gradient(bool): A boolean that mentions whether gradient should flow. + + Returns: + Variable: The global variable that gives access to the data. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name='x', shape=[784], dtype='float32') """ helper = LayerHelper('data', **locals()) shape = list(shape) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 2c38c232240fbe3541ca5e0efc51d8f47c6e4190..2adce99d052639ec7d9063b1c234c623e7cdb9c6 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -13,7 +13,8 @@ __all__ = [ 'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy', 'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand', - 'lstm_unit' + 'lstm_unit', 'reduce_sum', 'reduce_mean', 'sequence_first_step', + 'sequence_last_step' ] @@ -25,34 +26,83 @@ def fc(input, act=None, name=None): """ - Fully Connected Layer. + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable (one for each input tensor) called weights for each input + tensor, which represents a fully connected weight matrix from each input + unit to each output unit. The fully connected layer multiplies each input + tensor with its coresponding weight to produce an output Tensor. If + multiple input tensors are given, the results of multiple multiplications + will be sumed up. If bias_attr is not None, a biases variable will be + created and added to the output. Finally, if activation is not None, + it will be applied to the output as well. + + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}W_iX_i + b}) + + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation funtion. + * :math:`Out`: The output tensor. Args: - input: The input tensor to the function - size: The size of the layer - num_flatten_dims: Number of columns in input - param_attr: The parameters/weights to the FC Layer - param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used - bias_attr: The bias parameter for the FC layer - bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used - act: Activation to be applied to the output of FC layer - name: Name/alias of the function - main_program: Name of the main program that calls this - startup_program: Name of the startup program + input(Variable|list): The input tensor(s) to the fully connected layer. + size(int): The number of output units in the fully connected layer. 
+ num_flatten_dims(int): The fc layer can accept an input tensor with more + than two dimensions. If this happens, the + multidimensional tensor will first be flattened + into a 2-dimensional matrix. The parameter + `num_flatten_dims` determines how the input tensor + is flattened: the first `num_flatten_dims` + dimensions will be flatten to form the first + dimension of the final matrix (height of the + matrix), and the rest `rank(X) - num_col_dims` + dimensions are flattened to form the second + dimension of the final matrix (width of the matrix). + For example, suppose `X` is a 6-dimensional tensor + with a shape [2, 3, 4, 5, 6], and + `x_num_col_dims` = 3. Then, the flattened matrix + will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. + By default, `x_num_col_dims` is set to 1. + param_attr(ParamAttr|list): The parameter attribute for learnable + parameters/weights of the fully connected + layer. + param_initializer(ParamAttr|list): The initializer used for the + weight/parameter. If set None, + XavierInitializer() will be used. + bias_attr(ParamAttr|list): The parameter attribute for the bias parameter + for this layer. If set None, no bias will be + added to the output units. + bias_initializer(ParamAttr|list): The initializer used for the bias. + If set None, then ConstantInitializer() + will be used. + act(str): Activation to be applied to the output of the fully connected + layer. + name(str): Name/alias of the fully connected layer. - This function can take in multiple inputs and performs the Fully Connected - function (linear transformation) on top of each of them. - So for input x, the output will be : Wx + b. Where W is the parameter, - b the bias and x is the input. - The function also applies an activation (non-linearity) on top of the - output, if activation is passed in the input. + Returns: + Variable: The output tensor variable. + + Raises: + ValueError: If rank of the input tensor is less than 2. - All the input variables of this function are passed in as local variables - to the LayerHelper constructor. + Examples: + .. code-block:: python + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") """ - helper = LayerHelper('fc', **locals()) + + helper = LayerHelper("fc", **locals()) dtype = helper.input_dtype() @@ -72,8 +122,8 @@ def fc(input, "Y": w, }, outputs={"Out": tmp}, - attrs={'x_num_col_dims': num_flatten_dims, - 'y_num_col_dims': 1}) + attrs={"x_num_col_dims": num_flatten_dims, + "y_num_col_dims": 1}) mul_results.append(tmp) # sum @@ -91,25 +141,30 @@ def fc(input, def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): """ - Embedding Layer. + **Embedding Layer** + + This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table. + The result of this lookup is the embedding of each ID in the *input*. + + All the input variables are passed in as local variables to the LayerHelper + constructor. 
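To make the num_flatten_dims behaviour documented for fc above concrete, a small sketch that mirrors the shape example from that description; it assumes append_batch_size=False on the data layer so the declared shape is used as-is.

.. code-block:: python

    # Mirrors the example above: an input of shape [2, 3, 4, 5, 6] with
    # num_flatten_dims=3 is flattened to a [2*3*4, 5*6] = [24, 30] matrix
    # before the multiplication with the weight.
    x = fluid.layers.data(
        name='x', shape=[2, 3, 4, 5, 6], dtype='float32',
        append_batch_size=False)
    out = fluid.layers.fc(input=x, size=10, num_flatten_dims=3)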
 
     Args:
-        param_initializer:
-        input: The input to the function
-        size: The size of the layer
-        is_sparse: A flag that decleares whether the input is sparse
-        param_attr: Parameters for this layer
-        dtype: The type of data : float32, float_16, int etc
-        main_program: Name of the main program that calls this
-        startup_program: Name of the startup program
+        input(Variable): Input to the function
+        size(tuple|list|None): Shape of the lookup table parameter
+        is_sparse(bool): Boolean flag specifying whether the input is sparse
+        param_attr(ParamAttr): Parameters for this layer
+        dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
 
-    This function can take in the input (which is a vector of IDs) and
-    performs a lookup in the lookup_table using these IDs, to result into
-    the embedding of each ID in the input.
+    Returns:
+        Variable: The tensor variable storing the embeddings of the \
+                  supplied inputs.
 
-    All the input variables of this function are passed in as local variables
-    to the LayerHelper constructor.
+    Examples:
+        .. code-block:: python
+          data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
+          fc = fluid.layers.embedding(input=data, size=16)
     """
     helper = LayerHelper('embedding', **locals())
@@ -402,8 +457,8 @@ def chunk_eval(input,
         },
         attrs={
             "num_chunk_types": num_chunk_types,
-            'chunk_scheme': chunk_scheme,
-            'excluded_chunk_types': excluded_chunk_types or []
+            "chunk_scheme": chunk_scheme,
+            "excluded_chunk_types": excluded_chunk_types or []
         })
     return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks
 
@@ -520,9 +575,53 @@ def conv2d(input,
 
 def sequence_pool(input, pool_type, **kwargs):
     """
-    This function add the operator for sequence pooling.
-    This is applied on top of the input using pool_type mentioned
-    in the parameters.
+    This function adds the operator for sequence pooling.
+    It pools features of all time-steps of each instance, and is applied
+    on top of the input using pool_type mentioned in the parameters.
+
+    It supports four pool_type:
+
+    - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
+    - sum: :math:`Out[i] = \sum_jX_{ij}`
+    - sqrt: :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
+    - max: :math:`Out[i] = max(X_i)`
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+
+       for different pool_type:
+         average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+         sum    : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+         sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+         max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        pool_type (string): The pooling type of sequence_pool.
+            It supports average, sum, sqrt and max.
+
+    Returns:
+        The sequence pooling variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[7, 1],
+                         dtype='float32', lod_level=1)
+          avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
+          sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
+          sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
+          max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
     """
     helper = LayerHelper('sequence_pool', input=input, **kwargs)
     dtype = helper.input_dtype()
@@ -539,6 +638,72 @@ def sequence_pool(input, pool_type, **kwargs):
     return pool_out
 
 
+def sequence_first_step(input, **kwargs):
+    """
+    This function gets the first step of a sequence.
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+
+    Returns:
+        The sequence's first step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[7, 1],
+                         dtype='float32', lod_level=1)
+          x_first_step = fluid.layers.sequence_first_step(input=x)
+    """
+    return sequence_pool(input=input, pool_type="first")
+
+
+def sequence_last_step(input, **kwargs):
+    """
+    This function gets the last step of a sequence.
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+
+    Returns:
+        The sequence's last step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[7, 1],
+                         dtype='float32', lod_level=1)
+          x_last_step = fluid.layers.sequence_last_step(input=x)
+    """
+    return sequence_pool(input=input, pool_type="last")
+
+
 def pool2d(input,
            pool_size,
            pool_type,
@@ -683,6 +848,7 @@ def conv2d_transpose(input,
                      filter_size=None,
                      padding=None,
                      stride=None,
+                     dilation=None,
                      param_attr=None):
     """
     The transpose of conv2d layer.
@@ -706,6 +872,9 @@ def conv2d_transpose(input,
         stride(int|tuple): The stride size. If stride is a tuple, it must
             contain two integers, (stride_H, stride_W). Otherwise, the
             stride_H = stride_W = stride.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation.
         param_attr: Parameter Attribute.
main_program(Program): the main program startup_program(Program): the startup program @@ -726,10 +895,15 @@ def conv2d_transpose(input, op_attr['paddings'] = padding if isinstance(stride, int): - op_attr['strides'] = stride + op_attr['strides'] = [stride, stride] elif stride is not None: op_attr['strides'] = stride + if isinstance(dilation, int): + op_attr['dilations'] = [dilation, dilation] + elif dilation is not None: + op_attr['dilations'] = dilation + if filter_size is None: if output_size is None: raise ValueError("output_size must be set when filter_size is None") @@ -738,14 +912,17 @@ def conv2d_transpose(input, padding = op_attr.get('paddings', [0, 0]) stride = op_attr.get('strides', [1, 1]) + dilation = op_attr.get('dilations', [1, 1]) h_in = input.shape[2] w_in = input.shape[3] - filter_size_h = output_size[0] - \ - (h_in - 1) * stride[0] + 2 * padding[0] - filter_size_w = output_size[1] - \ - (w_in - 1) * stride[1] + 2 * padding[1] + + filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 * + padding[0] - 1) / dilation[0] + 1 + filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 * + padding[1] - 1) / dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] + elif isinstance(filter_size, int): filter_size = [filter_size, filter_size] @@ -935,3 +1112,91 @@ def lstm_unit(x_t, attrs={"forget_bias": forget_bias}) return h, c + + +def reduce_sum(input, dim=None, keep_dim=False): + """ + Computes the sum of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the sum is performed. If + :attr:`None`, sum all elements of :attr:`input` and return a + Tensor variable with a single element, otherwise must be in the + range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, + the dimension to reduce is :math:`rank + dim`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_sum(x) # [3.5] + fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] + fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6] + fluid.layers.reduce_sum(x, dim=1, keep_dim=True) # [[1.9], [1.6]] + """ + helper = LayerHelper('reduce_sum', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def reduce_mean(input, dim=None, keep_dim=False): + """ + Computes the mean of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (int|None): The dimension along which the mean is computed. If + :attr:`None`, compute the mean over all elements of :attr:`input` + and return a Tensor variable with a single element, otherwise + must be in the range :math:`[-rank(input), rank(input))`. If + :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. 
The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a Tensor variable with following elements: + # [[0.2, 0.3, 0.5, 0.9] + # [0.1, 0.2, 0.6, 0.7]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_mean(x) # [0.4375] + fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] + fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4] + fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]] + """ + helper = LayerHelper('reduce_mean', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='reduce_mean', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else 0, + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index bda017b141dcba5ac268c34388742c433a533337..e5820d24cd2b34ef53cbb91e2be66efc1b74d315 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -27,10 +27,23 @@ def cast(x, dtype): return out -def concat(input, axis): +def concat(input, axis=0): """ - This function concats the input along the axis mentioned + **Concat** + + This function concatenates the input along the axis mentioned and returns that as the output. + + Args: + input(list): List of tensors to be concatenated + axis(int): Integer axis along which the tensors will be concatenated + + Returns: + Variable: Output variable of the concatenation + + Examples: + .. code-block:: python + out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth]) """ helper = LayerHelper('concat', **locals()) out = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -43,9 +56,28 @@ def concat(input, axis): def sums(input, out=None): - """ - This function takes in the input and performs the sum operation on it - and returns that as the output. + """This function performs the sum operation on the input and returns the + result as the output. + + Args: + input (Variable|list): The input tensor that has the elements + that need to be summed up. + + Returns: + Variable: The tensor type variable that has the sum of input + written to it. + + Examples: + .. code-block::python + + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + a0 = layers.array_read(array=tmp, i=i) + i = layers.increment(x=i) + a1 = layers.array_read(array=tmp, i=i) + mean_a0 = layers.mean(x=a0) + mean_a1 = layers.mean(x=a1) + a_sum = layers.sums(input=[mean_a0, mean_a1]) """ helper = LayerHelper('sum', **locals()) if out is None: @@ -55,6 +87,24 @@ def sums(input, out=None): def assign(input, output): + """ + **Assign** + + This function copies the *input* Variable to the *output* Variable. + + Args: + input(Variable): The source variable + output(Variable): The destination variable + + Returns: + Variable: The destination variable that was supplied as the *output*. + + Examples: + .. 
code-block:: python + out = fluid.layers.create_tensor(dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + fluid.layers.assign(hidden, out) + """ helper = LayerHelper('assign', **locals()) helper.append_op( type='scale', @@ -66,9 +116,26 @@ def assign(input, output): def fill_constant(shape, dtype, value, out=None): """ - This function creates a tensor , with shape as mentioned in the input and - specified dtype and fills this up with a constant value that - comes in the input. It also sets the stop_gradient to be True. + **fill_constant** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with a constant supplied in *value*. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + value(float): Constant value to initialize the output tensor + out(Variable): Output Variable to initialize + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') """ helper = LayerHelper("fill_constant", **locals()) if out is None: @@ -90,6 +157,31 @@ def fill_constant_batch_size_like(input, value, input_dim_idx=0, output_dim_idx=0): + """ + **fill_constant_batch_size_like** + + This function creates a tensor of specified *shape*, *dtype* and batch size, + and initializes this with a constant supplied in *value*. The batch size is + obtained from the `input` tensor. + + It also sets *stop_gradient* to True. + + Args: + input(Variable): Tensor whose dimensions will be used to get batch size + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + value(float): Constant value to initialize the output tensor + input_dim_idx(int): Index of input's batch size dimension + output_dim_idx(int): Index of output's batch size dimension + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. 
code-block:: python + + data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') + """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) helper.append_op( diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 84fcbcdc2f2868a84bad5e145a934e33485b1fef..c56a531ed531cf0219e94854ba66c7399e003292 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -207,7 +207,7 @@ class Optimizer(object): optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) - return optimize_ops + return optimize_ops, params_grads class SGDOptimizer(Optimizer): diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py index f6f320c788e7e08d44df8aff5ad3792b237e103a..ab4561b0423dd73c8c0d529cbf34b52876b1b77c 100644 --- a/python/paddle/v2/fluid/param_attr.py +++ b/python/paddle/v2/fluid/param_attr.py @@ -58,7 +58,9 @@ class ParamAttr(object): def to_kwargs(self, with_initializer=False): kwargs = { 'name': self.name, - 'learning_rate': self.learning_rate, + 'optimize_attr': { + 'learning_rate': self.learning_rate + }, 'regularizer': self.regularizer, 'trainable': self.trainable, 'clip_attr': self.clip diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py index 6f5dd365ded628ad49800f0a04f208ec49cca4c5..7aa82906114b355277185211134bb791e5dc43f9 100644 --- a/python/paddle/v2/fluid/registry.py +++ b/python/paddle/v2/fluid/registry.py @@ -8,7 +8,7 @@ import proto.framework_pb2 as framework_pb2 from framework import OpProtoHolder, Variable, Program, Operator from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -__all__ = ['deprecated', 'register_layer'] +__all__ = ['deprecated', 'register_layer', 'autodoc'] def _convert_(name): @@ -175,12 +175,18 @@ def deprecated(func_or_class): """ Wrap func with deprecated warning """ - warnings.simplefilter('always', DeprecationWarning) #turn off filter + warnings.simplefilter('always', DeprecationWarning) # turn off filter warnings.warn( "Call to deprecated function {}.".format(func.__name__), category=DeprecationWarning, stacklevel=2) - warnings.simplefilter('default', DeprecationWarning) #reset filter + warnings.simplefilter('default', DeprecationWarning) # reset filter return func(*args, **kwargs) return func_wrapper + + +def autodoc(func): + func.__doc__ = _generate_doc_string_(OpProtoHolder.instance().get_op_proto( + func.__name__)) + return func diff --git a/python/paddle/v2/fluid/tests/__init__.py b/python/paddle/v2/fluid/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..2680502efb91061be37a77fbe5b451960fdd15f7 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/notest_recognize_digits_conv_dist.py @@ -0,0 +1,72 @@ +from __future__ import print_function +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import os + +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") 
+conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimize_ops, params_grads = optimizer.minimize(avg_cost) + +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + +BATCH_SIZE = 50 +PASS_NUM = 3 +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +exe = fluid.Executor(place) +t = fluid.DistributeTranspiler() +pserver_endpoints = os.getenv("PSERVERS") +training_role = os.getenv("TRAINING_ROLE", + "TRAINER") # get the training role: trainer/pserver +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=1) + +if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(pserver_endpoints, optimize_ops) + exe.run(fluid.default_startup_program()) + exe.run(pserver_prog) +elif training_role == "TRAINER": + feeder = fluid.DataFeeder(feed_list=[images, label], place=place) + exe.run(fluid.default_startup_program()) + + for pass_id in range(PASS_NUM): + accuracy.reset(exe) + for data in train_reader(): + loss, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) + pass_acc = accuracy.eval(exe) + # print loss, acc + if loss < 10.0 and pass_acc > 0.9: + # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. + exit(0) + + pass_acc = accuracy.eval(exe) + print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) +else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + +exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 80ffc5a544c201ed45a6de46b5a2addff82246b7..e79864b3977ed8111903f9497685ee7ebf76e1da 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -33,7 +33,7 @@ def encoder_decoder(): fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last") + encoder_out = layers.sequence_last_step(input=lstm_hidden0) # decoder trg_language_word = layers.data( diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index dee2febb83d171ed4a13921e3b7d37322ead2786..a9c0b1cfd3417d0583fa9d4e15550e7543a6bd19 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -208,7 +208,7 @@ class TestBatchNormOp(OpTest): print 'python: NHWC, NCHW, backward checking passed' def test_forward_backward(self): - def test_with_place(place, tensor_format, shape): + def test_with_place(place, data_layout, shape): # attr epsilon = 0.00001 momentum = 0.9 @@ -292,7 +292,7 @@ class TestBatchNormOp(OpTest): SavedVariance="saved_variance", # attrs is_test=False, - tensor_format=tensor_format, + data_layout=data_layout, momentum=momentum, epsilon=epsilon) @@ -311,7 +311,7 @@ class TestBatchNormOp(OpTest): atol = 1e-4 self.__assert_close(variance_out_tensor, variance_out, 
"variance_out", atol) - print "op test forward passed: ", str(place), tensor_format + print "op test forward passed: ", str(place), data_layout # run backward batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set()) @@ -336,11 +336,15 @@ class TestBatchNormOp(OpTest): self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad") self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad") self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad") - print "op test backward passed: ", str(place), tensor_format + print "op test backward passed: ", str(place), data_layout places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): places.append(core.GPUPlace(0)) + + core.init_devices(["CPU", "GPU:0"]) + else: + core.init_devices(["CPU"]) for place in places: for data_format in ["NCHW", "NHWC"]: test_with_place(place, data_format, [2, 3, 4, 5]) diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py index d7b1f2f2a3abf6335998742dbbef8e17794170fa..d59537b924d57d40f7d740d99eb814c95f528e5f 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py @@ -3,14 +3,17 @@ import numpy as np from op_test import OpTest -def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): +def conv2dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_h, in_w = input_.shape f_c, out_c, f_h, f_w = filter_.shape assert in_c == f_c - stride, pad = conv2dtranspose_param['stride'], conv2dtranspose_param['pad'] - out_h = (in_h - 1) * stride[0] + f_h - out_w = (in_w - 1) * stride[1] + f_w + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + d_bolck_h = dilations[0] * (f_h - 1) + 1 + d_bolck_w = dilations[1] * (f_w - 1) + 1 + out_h = (in_h - 1) * stride[0] + d_bolck_h + out_w = (in_w - 1) * stride[1] + d_bolck_w out = np.zeros((in_n, out_c, out_h, out_w)) @@ -23,9 +26,9 @@ def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param): for k in range(out_c): tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0) - i1, i2 = i * stride[0], i * stride[0] + f_h - j1, j2 = j * stride[0], j * stride[0] + f_w - out[n, k, i1:i2, j1:j2] += tmp_out + i1, i2 = i * stride[0], i * stride[0] + d_bolck_h + j1, j2 = j * stride[0], j * stride[0] + d_bolck_h + out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]] return out @@ -37,11 +40,8 @@ class TestConv2dTransposeOp(OpTest): self.init_op_type() self.init_test_case() - conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad} input_ = np.random.random(self.input_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32") - output = conv2dtranspose_forward_naive( - input_, filter_, conv2dtranspose_param).astype('float32') self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { @@ -49,6 +49,10 @@ class TestConv2dTransposeOp(OpTest): 'paddings': self.pad, 'dilations': self.dilations } + + output = conv2dtranspose_forward_naive(input_, filter_, + self.attrs).astype('float32') + self.outputs = {'Output': output} def test_check_output(self): @@ -104,11 +108,60 @@ class TestWithStride(TestConv2dTransposeOp): self.filter_size = [f_c, 6, 3, 3] +class TestWithDilation(TestConv2dTransposeOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [2, 2] + self.input_size = [2, 3, 5, 
5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + # ------------ test_cudnn ------------ class TestCudnn(TestConv2dTransposeOp): def init_op_type(self): self.op_type = "conv2d_transpose_cudnn" +class TestCudnnWithPad(TestWithPad): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.op_type = "conv2d_transpose_cudnn" + + +class TestCudnnWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.op_type = "conv2d_transpose_cudnn" + + +# #cudnn v5 does not support dilation conv. +# class TestCudnnWithDilation(TestWithDilation): +# def init_test_case(self): +# self.pad = [1, 1] +# self.stride = [2, 2] +# self.dilations = [2, 2] +# self.input_size = [2, 3, 5, 5] # NCHW +# f_c = self.input_size[1] +# self.filter_size = [f_c, 6, 3, 3] +# +# def init_op_type(self): +# self.op_type = "conv2d_transpose_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py index 8fd34b87bfea91307f52fdcbb9f71f2e1a9c6c56..a353f9b4d40233de46237005138f21430f4d865a 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py @@ -3,15 +3,20 @@ import numpy as np from op_test import OpTest -def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): +def conv3dtranspose_forward_naive(input_, filter_, attrs): in_n, in_c, in_d, in_h, in_w = input_.shape f_c, out_c, f_d, f_h, f_w = filter_.shape assert in_c == f_c - stride, pad = conv3dtranspose_param['stride'], conv3dtranspose_param['pad'] - out_d = (in_d - 1) * stride[0] + f_d - out_h = (in_h - 1) * stride[1] + f_h - out_w = (in_w - 1) * stride[2] + f_w + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + + d_bolck_d = dilations[0] * (f_d - 1) + 1 + d_bolck_h = dilations[1] * (f_h - 1) + 1 + d_bolck_w = dilations[2] * (f_w - 1) + 1 + out_d = (in_d - 1) * stride[0] + d_bolck_d + out_h = (in_h - 1) * stride[1] + d_bolck_h + out_w = (in_w - 1) * stride[2] + d_bolck_w out = np.zeros((in_n, out_c, out_d, out_h, out_w)) for n in range(in_n): @@ -25,10 +30,11 @@ def conv3dtranspose_forward_naive(input_, filter_, conv3dtranspose_param): for k in range(out_c): tmp_out = np.sum(input_masked * filter_[:, k, :, :, :], axis=0) - d1, d2 = d * stride[0], d * stride[0] + f_d - i1, i2 = i * stride[1], i * stride[1] + f_h - j1, j2 = j * stride[2], j * stride[2] + f_w - out[n, k, d1:d2, i1:i2, j1:j2] += tmp_out + d1, d2 = d * stride[0], d * stride[0] + d_bolck_d + i1, i2 = i * stride[1], i * stride[1] + d_bolck_h + j1, j2 = j * stride[2], j * stride[2] + d_bolck_w + out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2: + dilations[2]] += tmp_out out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w - pad[2]] @@ -41,18 +47,19 @@ class TestConv3dTransposeOp(OpTest): self.init_op_type() self.init_test_case() - conv3dtranspose_param = {'stride': self.stride, 'pad': self.pad} input_ = np.random.random(self.input_size).astype("float32") filter_ = np.random.random(self.filter_size).astype("float32") - output = 
conv3dtranspose_forward_naive( - input_, filter_, conv3dtranspose_param).astype("float32") self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, - # 'dilations': self.dilations + 'dilations': self.dilations } + + output = conv3dtranspose_forward_naive(input_, filter_, + self.attrs).astype("float32") + self.outputs = {'Output': output} def test_check_output(self): @@ -108,11 +115,60 @@ class TestWithStride(TestConv3dTransposeOp): self.filter_size = [f_c, 6, 3, 3, 3] +class TestWithDilation(TestConv3dTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + # ------------ test_cudnn ------------ class TestCudnn(TestConv3dTransposeOp): def init_op_type(self): self.op_type = "conv3d_transpose_cudnn" +class TestCudnnWithPad(TestWithPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose_cudnn" + + +class TestCudnnWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose_cudnn" + + +# #cudnn v5 does not support dilation conv. +# class TestCudnnWithDilation(TestWithDilation): +# def init_test_case(self): +# self.pad = [1, 1, 1] +# self.stride = [2, 2, 2] +# self.dilations = [2, 2, 2] +# self.input_size = [2, 3, 5, 5, 5] # NCDHW +# f_c = self.input_size[1] +# self.filter_size = [f_c, 6, 3, 3, 3] +# +# def init_op_type(self): +# self.op_type = "conv3d_transpose_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py index 4f5ea836b44102e5599a2302efd669291ebe920b..2483200212686caf9c46f9c1129b5d8ffdcc9145 100644 --- a/python/paddle/v2/fluid/tests/test_dropout_op.py +++ b/python/paddle/v2/fluid/tests/test_dropout_op.py @@ -47,7 +47,9 @@ class TestDropoutOp4(OpTest): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.attrs = {'dropout_prob': 0.35, 'is_test': True} - self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + self.outputs = { + 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) + } def test_check_output(self): self.check_output() @@ -58,7 +60,9 @@ class TestDropoutOp5(OpTest): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")} self.attrs = {'dropout_prob': 0.75, 'is_test': True} - self.outputs = {'Out': self.inputs['X'] * self.attrs['dropout_prob']} + self.outputs = { + 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) + } def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py index 034266c26f48197872a3419135d45b30a8120e8a..8090c5f47814c60034f2f46f00e56c530e0f2c19 100644 --- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py +++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py @@ -63,8 +63,7 @@ class TestDynRNN(unittest.TestCase): all_timesteps = 
fluid.layers.array_to_lod_tensor( x=out, table=rank_table) - last = fluid.layers.sequence_pool( - input=all_timesteps, pool_type='last') + last = fluid.layers.sequence_last_step(input=all_timesteps) logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits( x=logits, label=label) @@ -101,7 +100,7 @@ class TestDynRNN(unittest.TestCase): rnn.update_memory(mem, out_) rnn.output(out_) - last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') + last = fluid.layers.sequence_last_step(input=rnn()) logits = fluid.layers.fc(input=last, size=1, act=None) label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits( diff --git a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py index eff8fa87d9c0dafc6935604101e94ee6c8b081ce..cd91769a22f8d6af193efabd8d997913676fbba6 100644 --- a/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py +++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py @@ -7,7 +7,7 @@ class TestFillZerosLikeOp(OpTest): def setUp(self): self.op_type = "fill_zeros_like" self.inputs = {'X': np.random.random((219, 232)).astype("float32")} - self.outputs = {'Y': np.zeros_like(self.inputs["X"])} + self.outputs = {'Out': np.zeros_like(self.inputs["X"])} def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index 627ab4e23562f14538d85f2e21edeb7d72d940bb..a9d943b8b7f7d9bc0dec89c5360769e0328527ba 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -1,32 +1,47 @@ import unittest +import numpy + +import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core from paddle.v2.fluid.op import Operator -import numpy +from paddle.v2.fluid.executor import Executor class TestGaussianRandomOp(unittest.TestCase): + def setUp(self): + self.op_type = "gaussian_random" + self.inputs = {} + self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10} + + self.outputs = ["Out"] + def test_cpu(self): - self.gaussian_random_test(place=core.CPUPlace()) + self.gaussian_random_test(place=fluid.CPUPlace()) def test_gpu(self): if core.is_compile_gpu(): - self.gaussian_random_test(place=core.GPUPlace(0)) + self.gaussian_random_test(place=fluid.GPUPlace(0)) def gaussian_random_test(self, place): - scope = core.Scope() - scope.var('Out').get_tensor() - - op = Operator( - "gaussian_random", - Out='Out', - shape=[1000, 784], - mean=.0, - std=1., - seed=10) context = core.DeviceContext.create(place) - op.run(scope, context) - tensor = numpy.array(scope.find_var('Out').get_tensor()) + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + tensor = outs[0] + self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1) self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1) diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py index 
2459dfd664300d405edb36c4ca906c1769b5e7d2..29694be58bce0eb41b05439da35ef07a542ef12a 100644 --- a/python/paddle/v2/fluid/tests/test_optimizer.py +++ b/python/paddle/v2/fluid/tests/test_optimizer.py @@ -27,7 +27,7 @@ class TestOptimizer(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 1) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") @@ -57,7 +57,7 @@ class TestOptimizer(unittest.TestCase): learning_rate = 0.01 sgd_optimizer = optimizer.SGDOptimizer( learning_rate=learning_rate, global_step=global_step) - opts = sgd_optimizer.minimize(mean_out, init_program) + opts, _ = sgd_optimizer.minimize(mean_out, init_program) self.assertEqual(len(opts), 2) sgd_op = opts[0] self.assertEqual(sgd_op.type, "sgd") diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5774835e02191a068e86ea56f3f877c464a391 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -0,0 +1,47 @@ +import unittest +import paddle.v2.fluid as fluid +import numpy + + +class TestReorderLoDTensor(unittest.TestCase): + def test_reorder(self): + dat = fluid.layers.data(name='input', shape=[1], lod_level=2) + dat.stop_gradient = False + rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1) + table = fluid.layers.lod_rank_table(rank_dat) + new_dat = fluid.layers.reorder_lod_tensor_by_rank( + x=dat, rank_table=table) + loss = fluid.layers.mean(x=new_dat) + fluid.backward.append_backward_ops(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(fluid.default_startup_program()) + + ref = fluid.Tensor() + ref_lod = [0, 3, 4, 7, 8, 14] + ref.set_lod([ref_lod]) + + ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu) + input = fluid.Tensor() + lod_level_0 = numpy.random.randint(low=1, high=5, size=5) + lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist() + lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1]) + lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist() + + input.set_lod([lod_level_0, lod_level_1]) + input.set( + numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'), + cpu) + + ig = exe.run(fluid.default_main_program(), + feed={'input': input, + 'ref': ref}, + fetch_list=['input@GRAD'], + return_numpy=False)[0] + self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001) + self.assertEqual(input.lod(), ig.lod()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index f736dfb2e85552b321403c961da517f3b3efb100..00b4f196209a6414f1063a33c0e31093e33ca39d 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -1,32 +1,50 @@ import unittest +import numpy + from paddle.v2.fluid.op import Operator import paddle.v2.fluid.core as core -import numpy +import paddle.v2.fluid as fluid class TestUniformRandomOp(unittest.TestCase): - def test_uniform_random_cpu(self): + def setUp(self): + self.op_type = "uniform_random" + self.inputs = {} + self.attrs = { + "shape": [1000, 784], + "min": -5.0, + "max": 10.0, + "seed": 10 + } + self.outputs = ["Out"] + + 
def test_cpu(self): self.uniform_random_test(place=core.CPUPlace()) - def test_uniform_random_gpu(self): + def test_gpu(self): if core.is_compile_gpu(): self.uniform_random_test(place=core.GPUPlace(0)) def uniform_random_test(self, place): - scope = core.Scope() - scope.var('X').get_tensor() - - op = Operator( - "uniform_random", - Out='X', - shape=[1000, 784], - min=-5.0, - max=10.0, - seed=10) - - ctx = core.DeviceContext.create(place) - op.run(scope, ctx) - tensor = numpy.array(scope.find_var('X').get_tensor()) + context = core.DeviceContext.create(place) + program = fluid.Program() + block = program.global_block() + vout = block.create_var(name="Out") + op = block.append_op( + type=self.op_type, outputs={"Out": vout}, attrs=self.attrs) + + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + + fetch_list = [] + for var_name in self.outputs: + fetch_list.append(block.var(var_name)) + + exe = fluid.Executor(place) + outs = exe.run(program, fetch_list=fetch_list) + + tensor = outs[0] + self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)